xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUSubtarget.h"
20 #include "SIFrameLowering.h"
21 #include "SIISelLowering.h"
22 #include "SIInstrInfo.h"
23 #include "Utils/AMDGPUBaseInfo.h"
24 #include "llvm/Support/ErrorHandling.h"
25 
26 #define GET_SUBTARGETINFO_HEADER
27 #include "AMDGPUGenSubtargetInfo.inc"
28 
29 namespace llvm {
30 
31 class GCNTargetMachine;
32 
33 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
34                            public AMDGPUSubtarget {
35 public:
36   using AMDGPUSubtarget::getMaxWavesPerEU;
37 
38   // Following 2 enums are documented at:
39   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
40   enum class TrapHandlerAbi {
41     NONE   = 0x00,
42     AMDHSA = 0x01,
43   };
44 
45   enum class TrapID {
46     LLVMAMDHSATrap      = 0x02,
47     LLVMAMDHSADebugTrap = 0x03,
48   };
49 
50 private:
51   /// SelectionDAGISel related APIs.
52   std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
53 
54   /// GlobalISel related APIs.
55   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
56   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
57   std::unique_ptr<InstructionSelector> InstSelector;
58   std::unique_ptr<LegalizerInfo> Legalizer;
59   std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
60 
61 protected:
62   // Basic subtarget description.
63   Triple TargetTriple;
64   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
65   unsigned Gen = INVALID;
66   InstrItineraryData InstrItins;
67   int LDSBankCount = 0;
68   unsigned MaxPrivateElementSize = 0;
69 
70   // Possibly statically set by tablegen, but may want to be overridden.
71   bool FastDenormalF32 = false;
72   bool HalfRate64Ops = false;
73   bool FullRate64Ops = false;
74 
75   // Dynamically set bits that enable features.
76   bool FlatForGlobal = false;
77   bool AutoWaitcntBeforeBarrier = false;
78   bool BackOffBarrier = false;
79   bool UnalignedScratchAccess = false;
80   bool UnalignedAccessMode = false;
81   bool HasApertureRegs = false;
82   bool SupportsXNACK = false;
83   bool KernargPreload = false;
84 
85   // This should not be used directly. 'TargetID' tracks the dynamic settings
86   // for XNACK.
87   bool EnableXNACK = false;
88 
89   bool EnableTgSplit = false;
90   bool EnableCuMode = false;
91   bool TrapHandler = false;
92   bool EnablePreciseMemory = false;
93 
94   // Used as options.
95   bool EnableLoadStoreOpt = false;
96   bool EnableUnsafeDSOffsetFolding = false;
97   bool EnableSIScheduler = false;
98   bool EnableDS128 = false;
99   bool EnablePRTStrictNull = false;
100   bool DumpCode = false;
101 
102   // Subtarget properties statically set by tablegen.
103   bool FP64 = false;
104   bool FMA = false;
105   bool MIMG_R128 = false;
106   bool CIInsts = false;
107   bool GFX8Insts = false;
108   bool GFX9Insts = false;
109   bool GFX90AInsts = false;
110   bool GFX940Insts = false;
111   bool GFX950Insts = false;
112   bool GFX10Insts = false;
113   bool GFX11Insts = false;
114   bool GFX12Insts = false;
115   bool GFX10_3Insts = false;
116   bool GFX7GFX8GFX9Insts = false;
117   bool SGPRInitBug = false;
118   bool UserSGPRInit16Bug = false;
119   bool NegativeScratchOffsetBug = false;
120   bool NegativeUnalignedScratchOffsetBug = false;
121   bool HasSMemRealTime = false;
122   bool HasIntClamp = false;
123   bool HasFmaMixInsts = false;
124   bool HasMovrel = false;
125   bool HasVGPRIndexMode = false;
126   bool HasScalarDwordx3Loads = false;
127   bool HasScalarStores = false;
128   bool HasScalarAtomics = false;
129   bool HasSDWAOmod = false;
130   bool HasSDWAScalar = false;
131   bool HasSDWASdst = false;
132   bool HasSDWAMac = false;
133   bool HasSDWAOutModsVOPC = false;
134   bool HasDPP = false;
135   bool HasDPP8 = false;
136   bool HasDPALU_DPP = false;
137   bool HasDPPSrc1SGPR = false;
138   bool HasPackedFP32Ops = false;
139   bool HasImageInsts = false;
140   bool HasExtendedImageInsts = false;
141   bool HasR128A16 = false;
142   bool HasA16 = false;
143   bool HasG16 = false;
144   bool HasNSAEncoding = false;
145   bool HasPartialNSAEncoding = false;
146   bool GFX10_AEncoding = false;
147   bool GFX10_BEncoding = false;
148   bool HasDLInsts = false;
149   bool HasFmacF64Inst = false;
150   bool HasDot1Insts = false;
151   bool HasDot2Insts = false;
152   bool HasDot3Insts = false;
153   bool HasDot4Insts = false;
154   bool HasDot5Insts = false;
155   bool HasDot6Insts = false;
156   bool HasDot7Insts = false;
157   bool HasDot8Insts = false;
158   bool HasDot9Insts = false;
159   bool HasDot10Insts = false;
160   bool HasDot11Insts = false;
161   bool HasDot12Insts = false;
162   bool HasDot13Insts = false;
163   bool HasMAIInsts = false;
164   bool HasFP8Insts = false;
165   bool HasFP8ConversionInsts = false;
166   bool HasCvtFP8Vop1Bug = false;
167   bool HasPkFmacF16Inst = false;
168   bool HasAtomicFMinFMaxF32GlobalInsts = false;
169   bool HasAtomicFMinFMaxF64GlobalInsts = false;
170   bool HasAtomicFMinFMaxF32FlatInsts = false;
171   bool HasAtomicFMinFMaxF64FlatInsts = false;
172   bool HasAtomicDsPkAdd16Insts = false;
173   bool HasAtomicFlatPkAdd16Insts = false;
174   bool HasAtomicFaddRtnInsts = false;
175   bool HasAtomicFaddNoRtnInsts = false;
176   bool HasMemoryAtomicFaddF32DenormalSupport = false;
177   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
178   bool HasAtomicBufferGlobalPkAddF16Insts = false;
179   bool HasAtomicCSubNoRtnInsts = false;
180   bool HasAtomicGlobalPkAddBF16Inst = false;
181   bool HasAtomicBufferPkAddBF16Inst = false;
182   bool HasFlatAtomicFaddF32Inst = false;
183   bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
184   bool HasDefaultComponentZero = false;
185   bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
186   bool HasDefaultComponentBroadcast = false;
187   bool HasXF32Insts = false;
188   /// The maximum number of instructions that may be placed within an S_CLAUSE,
189   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
190   /// indicates a lack of S_CLAUSE support.
191   unsigned MaxHardClauseLength = 0;
192   bool SupportsSRAMECC = false;
193 
194   // This should not be used directly. 'TargetID' tracks the dynamic settings
195   // for SRAMECC.
196   bool EnableSRAMECC = false;
197 
198   bool HasNoSdstCMPX = false;
199   bool HasVscnt = false;
200   bool HasGetWaveIdInst = false;
201   bool HasSMemTimeInst = false;
202   bool HasShaderCyclesRegister = false;
203   bool HasShaderCyclesHiLoRegisters = false;
204   bool HasVOP3Literal = false;
205   bool HasNoDataDepHazard = false;
206   bool FlatAddressSpace = false;
207   bool FlatInstOffsets = false;
208   bool FlatGlobalInsts = false;
209   bool FlatScratchInsts = false;
210   bool ScalarFlatScratchInsts = false;
211   bool HasArchitectedFlatScratch = false;
212   bool EnableFlatScratch = false;
213   bool HasArchitectedSGPRs = false;
214   bool HasGDS = false;
215   bool HasGWS = false;
216   bool AddNoCarryInsts = false;
217   bool HasUnpackedD16VMem = false;
218   bool LDSMisalignedBug = false;
219   bool HasMFMAInlineLiteralBug = false;
220   bool UnalignedBufferAccess = false;
221   bool UnalignedDSAccess = false;
222   bool HasPackedTID = false;
223   bool ScalarizeGlobal = false;
224   bool HasSALUFloatInsts = false;
225   bool HasPseudoScalarTrans = false;
226   bool HasRestrictedSOffset = false;
227   bool HasBitOp3Insts = false;
228   bool HasPrngInst = false;
229   bool HasPermlane16Swap = false;
230   bool HasPermlane32Swap = false;
231   bool HasVcmpxPermlaneHazard = false;
232   bool HasVMEMtoScalarWriteHazard = false;
233   bool HasSMEMtoVectorWriteHazard = false;
234   bool HasInstFwdPrefetchBug = false;
235   bool HasVcmpxExecWARHazard = false;
236   bool HasLdsBranchVmemWARHazard = false;
237   bool HasNSAtoVMEMBug = false;
238   bool HasNSAClauseBug = false;
239   bool HasOffset3fBug = false;
240   bool HasFlatSegmentOffsetBug = false;
241   bool HasImageStoreD16Bug = false;
242   bool HasImageGather4D16Bug = false;
243   bool HasMSAALoadDstSelBug = false;
244   bool HasPrivEnabledTrap2NopBug = false;
245   bool Has1_5xVGPRs = false;
246   bool HasMADIntraFwdBug = false;
247   bool HasVOPDInsts = false;
248   bool HasVALUTransUseHazard = false;
249   bool HasForceStoreSC0SC1 = false;
250   bool HasRequiredExportPriority = false;
251   bool HasVmemWriteVgprInOrder = false;
252   bool HasAshrPkInsts = false;
253   bool HasMinimum3Maximum3F32 = false;
254   bool HasMinimum3Maximum3F16 = false;
255   bool HasMinimum3Maximum3PKF16 = false;
256 
257   bool RequiresCOV6 = false;
258 
259   // Dummy feature to use for assembler in tablegen.
260   bool FeatureDisable = false;
261 
262 private:
263   SIInstrInfo InstrInfo;
264   SITargetLowering TLInfo;
265   SIFrameLowering FrameLowering;
266 
267 public:
268   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
269                const GCNTargetMachine &TM);
270   ~GCNSubtarget() override;
271 
272   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
273                                                    StringRef GPU, StringRef FS);
274 
275   /// Diagnose inconsistent subtarget features before attempting to codegen
276   /// function \p F.
277   void checkSubtargetFeatures(const Function &F) const;
278 
279   const SIInstrInfo *getInstrInfo() const override {
280     return &InstrInfo;
281   }
282 
283   const SIFrameLowering *getFrameLowering() const override {
284     return &FrameLowering;
285   }
286 
287   const SITargetLowering *getTargetLowering() const override {
288     return &TLInfo;
289   }
290 
291   const SIRegisterInfo *getRegisterInfo() const override {
292     return &InstrInfo.getRegisterInfo();
293   }
294 
295   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
296 
297   const CallLowering *getCallLowering() const override {
298     return CallLoweringInfo.get();
299   }
300 
301   const InlineAsmLowering *getInlineAsmLowering() const override {
302     return InlineAsmLoweringInfo.get();
303   }
304 
305   InstructionSelector *getInstructionSelector() const override {
306     return InstSelector.get();
307   }
308 
309   const LegalizerInfo *getLegalizerInfo() const override {
310     return Legalizer.get();
311   }
312 
313   const AMDGPURegisterBankInfo *getRegBankInfo() const override {
314     return RegBankInfo.get();
315   }
316 
317   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
318     return TargetID;
319   }
320 
321   const InstrItineraryData *getInstrItineraryData() const override {
322     return &InstrItins;
323   }
324 
325   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
326 
327   Generation getGeneration() const {
328     return (Generation)Gen;
329   }
330 
331   unsigned getMaxWaveScratchSize() const {
332     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
333     if (getGeneration() >= GFX12) {
334       // 18-bit field in units of 64-dword.
335       return (64 * 4) * ((1 << 18) - 1);
336     }
337     if (getGeneration() == GFX11) {
338       // 15-bit field in units of 64-dword.
339       return (64 * 4) * ((1 << 15) - 1);
340     }
341     // 13-bit field in units of 256-dword.
342     return (256 * 4) * ((1 << 13) - 1);
343   }
344 
345   /// Return the number of high bits known to be zero for a frame index.
346   unsigned getKnownHighZeroBitsForFrameIndex() const {
347     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
348   }
349 
350   int getLDSBankCount() const {
351     return LDSBankCount;
352   }
353 
354   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
355     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
356   }
357 
358   unsigned getConstantBusLimit(unsigned Opcode) const;
359 
360   /// Returns if the result of this instruction with a 16-bit result returned in
361   /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
362   /// the original value.
363   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
364 
365   bool supportsWGP() const { return getGeneration() >= GFX10; }
366 
367   bool hasIntClamp() const {
368     return HasIntClamp;
369   }
370 
371   bool hasFP64() const {
372     return FP64;
373   }
374 
375   bool hasMIMG_R128() const {
376     return MIMG_R128;
377   }
378 
379   bool hasHWFP64() const {
380     return FP64;
381   }
382 
383   bool hasHalfRate64Ops() const {
384     return HalfRate64Ops;
385   }
386 
387   bool hasFullRate64Ops() const {
388     return FullRate64Ops;
389   }
390 
391   bool hasAddr64() const {
392     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
393   }
394 
395   bool hasFlat() const {
396     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
397   }
398 
399   // Return true if the target only has the reverse operand versions of VALU
400   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
401   bool hasOnlyRevVALUShifts() const {
402     return getGeneration() >= VOLCANIC_ISLANDS;
403   }
404 
405   bool hasFractBug() const {
406     return getGeneration() == SOUTHERN_ISLANDS;
407   }
408 
409   bool hasBFE() const {
410     return true;
411   }
412 
413   bool hasBFI() const {
414     return true;
415   }
416 
417   bool hasBFM() const {
418     return hasBFE();
419   }
420 
421   bool hasBCNT(unsigned Size) const {
422     return true;
423   }
424 
425   bool hasFFBL() const {
426     return true;
427   }
428 
429   bool hasFFBH() const {
430     return true;
431   }
432 
433   bool hasMed3_16() const {
434     return getGeneration() >= AMDGPUSubtarget::GFX9;
435   }
436 
437   bool hasMin3Max3_16() const {
438     return getGeneration() >= AMDGPUSubtarget::GFX9;
439   }
440 
441   bool hasFmaMixInsts() const {
442     return HasFmaMixInsts;
443   }
444 
445   bool hasCARRY() const {
446     return true;
447   }
448 
449   bool hasFMA() const {
450     return FMA;
451   }
452 
453   bool hasSwap() const {
454     return GFX9Insts;
455   }
456 
457   bool hasScalarPackInsts() const {
458     return GFX9Insts;
459   }
460 
461   bool hasScalarMulHiInsts() const {
462     return GFX9Insts;
463   }
464 
465   bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
466 
467   TrapHandlerAbi getTrapHandlerAbi() const {
468     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
469   }
470 
471   bool supportsGetDoorbellID() const {
472     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
473     return getGeneration() >= GFX9;
474   }
475 
476   /// True if the offset field of DS instructions works as expected. On SI, the
477   /// offset uses a 16-bit adder and does not always wrap properly.
478   bool hasUsableDSOffset() const {
479     return getGeneration() >= SEA_ISLANDS;
480   }
481 
482   bool unsafeDSOffsetFoldingEnabled() const {
483     return EnableUnsafeDSOffsetFolding;
484   }
485 
486   /// Condition output from div_scale is usable.
487   bool hasUsableDivScaleConditionOutput() const {
488     return getGeneration() != SOUTHERN_ISLANDS;
489   }
490 
491   /// Extra wait hazard is needed in some cases before
492   /// s_cbranch_vccnz/s_cbranch_vccz.
493   bool hasReadVCCZBug() const {
494     return getGeneration() <= SEA_ISLANDS;
495   }
496 
497   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
498   bool partialVCCWritesUpdateVCCZ() const {
499     return getGeneration() >= GFX10;
500   }
501 
502   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
503   /// was written by a VALU instruction.
504   bool hasSMRDReadVALUDefHazard() const {
505     return getGeneration() == SOUTHERN_ISLANDS;
506   }
507 
508   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
509   /// SGPR was written by a VALU Instruction.
510   bool hasVMEMReadSGPRVALUDefHazard() const {
511     return getGeneration() >= VOLCANIC_ISLANDS;
512   }
513 
514   bool hasRFEHazards() const {
515     return getGeneration() >= VOLCANIC_ISLANDS;
516   }
517 
518   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
519   unsigned getSetRegWaitStates() const {
520     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
521   }
522 
523   bool dumpCode() const {
524     return DumpCode;
525   }
526 
527   /// Return the amount of LDS that can be used that will not restrict the
528   /// occupancy lower than WaveCount.
529   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
530                                            const Function &) const;
531 
532   bool supportsMinMaxDenormModes() const {
533     return getGeneration() >= AMDGPUSubtarget::GFX9;
534   }
535 
536   /// \returns If target supports S_DENORM_MODE.
537   bool hasDenormModeInst() const {
538     return getGeneration() >= AMDGPUSubtarget::GFX10;
539   }
540 
541   bool useFlatForGlobal() const {
542     return FlatForGlobal;
543   }
544 
545   /// \returns If target supports ds_read/write_b128 and user enables generation
546   /// of ds_read/write_b128.
547   bool useDS128() const {
548     return CIInsts && EnableDS128;
549   }
550 
551   /// \return If target supports ds_read/write_b96/128.
552   bool hasDS96AndDS128() const {
553     return CIInsts;
554   }
555 
556   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
557   bool haveRoundOpsF64() const {
558     return CIInsts;
559   }
560 
561   /// \returns If MUBUF instructions always perform range checking, even for
562   /// buffer resources used for private memory access.
563   bool privateMemoryResourceIsRangeChecked() const {
564     return getGeneration() < AMDGPUSubtarget::GFX9;
565   }
566 
567   /// \returns If target requires PRT Strict NULL support (zero result registers
568   /// for sparse texture support).
569   bool usePRTStrictNull() const {
570     return EnablePRTStrictNull;
571   }
572 
573   bool hasAutoWaitcntBeforeBarrier() const {
574     return AutoWaitcntBeforeBarrier;
575   }
576 
577   /// \returns true if the target supports backing off of s_barrier instructions
578   /// when an exception is raised.
579   bool supportsBackOffBarrier() const {
580     return BackOffBarrier;
581   }
582 
583   bool hasUnalignedBufferAccess() const {
584     return UnalignedBufferAccess;
585   }
586 
587   bool hasUnalignedBufferAccessEnabled() const {
588     return UnalignedBufferAccess && UnalignedAccessMode;
589   }
590 
591   bool hasUnalignedDSAccess() const {
592     return UnalignedDSAccess;
593   }
594 
595   bool hasUnalignedDSAccessEnabled() const {
596     return UnalignedDSAccess && UnalignedAccessMode;
597   }
598 
599   bool hasUnalignedScratchAccess() const {
600     return UnalignedScratchAccess;
601   }
602 
603   bool hasUnalignedScratchAccessEnabled() const {
604     return UnalignedScratchAccess && UnalignedAccessMode;
605   }
606 
607   bool hasUnalignedAccessMode() const {
608     return UnalignedAccessMode;
609   }
610 
611   bool hasApertureRegs() const {
612     return HasApertureRegs;
613   }
614 
615   bool isTrapHandlerEnabled() const {
616     return TrapHandler;
617   }
618 
619   bool isXNACKEnabled() const {
620     return TargetID.isXnackOnOrAny();
621   }
622 
623   bool isTgSplitEnabled() const {
624     return EnableTgSplit;
625   }
626 
627   bool isCuModeEnabled() const {
628     return EnableCuMode;
629   }
630 
631   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
632 
633   bool hasFlatAddressSpace() const {
634     return FlatAddressSpace;
635   }
636 
637   bool hasFlatScrRegister() const {
638     return hasFlatAddressSpace();
639   }
640 
641   bool hasFlatInstOffsets() const {
642     return FlatInstOffsets;
643   }
644 
645   bool hasFlatGlobalInsts() const {
646     return FlatGlobalInsts;
647   }
648 
649   bool hasFlatScratchInsts() const {
650     return FlatScratchInsts;
651   }
652 
653   // Check if target supports ST addressing mode with FLAT scratch instructions.
654   // The ST addressing mode means no registers are used, either VGPR or SGPR,
655   // but only immediate offset is swizzled and added to the FLAT scratch base.
656   bool hasFlatScratchSTMode() const {
657     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
658   }
659 
660   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
661 
662   bool hasScalarFlatScratchInsts() const {
663     return ScalarFlatScratchInsts;
664   }
665 
666   bool enableFlatScratch() const {
667     return flatScratchIsArchitected() ||
668            (EnableFlatScratch && hasFlatScratchInsts());
669   }
670 
671   bool hasGlobalAddTidInsts() const {
672     return GFX10_BEncoding;
673   }
674 
675   bool hasAtomicCSub() const {
676     return GFX10_BEncoding;
677   }
678 
679   bool hasExportInsts() const {
680     return !hasGFX940Insts();
681   }
682 
683   bool hasVINTERPEncoding() const {
684     return GFX11Insts;
685   }
686 
687   // DS_ADD_F64/DS_ADD_RTN_F64
688   bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
689 
690   bool hasMultiDwordFlatScratchAddressing() const {
691     return getGeneration() >= GFX9;
692   }
693 
694   bool hasFlatSegmentOffsetBug() const {
695     return HasFlatSegmentOffsetBug;
696   }
697 
698   bool hasFlatLgkmVMemCountInOrder() const {
699     return getGeneration() > GFX9;
700   }
701 
702   bool hasD16LoadStore() const {
703     return getGeneration() >= GFX9;
704   }
705 
706   bool d16PreservesUnusedBits() const {
707     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
708   }
709 
710   bool hasD16Images() const {
711     return getGeneration() >= VOLCANIC_ISLANDS;
712   }
713 
714   /// Return if most LDS instructions have an m0 use that requires m0 to be
715   /// initialized.
716   bool ldsRequiresM0Init() const {
717     return getGeneration() < GFX9;
718   }
719 
720   // True if the hardware rewinds and replays GWS operations if a wave is
721   // preempted.
722   //
723   // If this is false, a GWS operation requires testing if a nack set the
724   // MEM_VIOL bit, and repeating if so.
725   bool hasGWSAutoReplay() const {
726     return getGeneration() >= GFX9;
727   }
728 
729   /// \returns if target has ds_gws_sema_release_all instruction.
730   bool hasGWSSemaReleaseAll() const {
731     return CIInsts;
732   }
733 
734   /// \returns true if the target has integer add/sub instructions that do not
735   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
736   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
737   /// for saturation.
738   bool hasAddNoCarry() const {
739     return AddNoCarryInsts;
740   }
741 
742   bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
743 
744   bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
745 
746   bool hasUnpackedD16VMem() const {
747     return HasUnpackedD16VMem;
748   }
749 
750   // Covers VS/PS/CS graphics shaders
751   bool isMesaGfxShader(const Function &F) const {
752     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
753   }
754 
755   bool hasMad64_32() const {
756     return getGeneration() >= SEA_ISLANDS;
757   }
758 
759   bool hasSDWAOmod() const {
760     return HasSDWAOmod;
761   }
762 
763   bool hasSDWAScalar() const {
764     return HasSDWAScalar;
765   }
766 
767   bool hasSDWASdst() const {
768     return HasSDWASdst;
769   }
770 
771   bool hasSDWAMac() const {
772     return HasSDWAMac;
773   }
774 
775   bool hasSDWAOutModsVOPC() const {
776     return HasSDWAOutModsVOPC;
777   }
778 
779   bool hasDLInsts() const {
780     return HasDLInsts;
781   }
782 
783   bool hasFmacF64Inst() const { return HasFmacF64Inst; }
784 
785   bool hasDot1Insts() const {
786     return HasDot1Insts;
787   }
788 
789   bool hasDot2Insts() const {
790     return HasDot2Insts;
791   }
792 
793   bool hasDot3Insts() const {
794     return HasDot3Insts;
795   }
796 
797   bool hasDot4Insts() const {
798     return HasDot4Insts;
799   }
800 
801   bool hasDot5Insts() const {
802     return HasDot5Insts;
803   }
804 
805   bool hasDot6Insts() const {
806     return HasDot6Insts;
807   }
808 
809   bool hasDot7Insts() const {
810     return HasDot7Insts;
811   }
812 
813   bool hasDot8Insts() const {
814     return HasDot8Insts;
815   }
816 
817   bool hasDot9Insts() const {
818     return HasDot9Insts;
819   }
820 
821   bool hasDot10Insts() const {
822     return HasDot10Insts;
823   }
824 
825   bool hasDot11Insts() const {
826     return HasDot11Insts;
827   }
828 
829   bool hasDot12Insts() const {
830     return HasDot12Insts;
831   }
832 
833   bool hasDot13Insts() const {
834     return HasDot13Insts;
835   }
836 
837   bool hasMAIInsts() const {
838     return HasMAIInsts;
839   }
840 
841   bool hasFP8Insts() const {
842     return HasFP8Insts;
843   }
844 
845   bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
846 
847   bool hasPkFmacF16Inst() const {
848     return HasPkFmacF16Inst;
849   }
850 
851   bool hasAtomicFMinFMaxF32GlobalInsts() const {
852     return HasAtomicFMinFMaxF32GlobalInsts;
853   }
854 
855   bool hasAtomicFMinFMaxF64GlobalInsts() const {
856     return HasAtomicFMinFMaxF64GlobalInsts;
857   }
858 
859   bool hasAtomicFMinFMaxF32FlatInsts() const {
860     return HasAtomicFMinFMaxF32FlatInsts;
861   }
862 
863   bool hasAtomicFMinFMaxF64FlatInsts() const {
864     return HasAtomicFMinFMaxF64FlatInsts;
865   }
866 
867   bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
868 
869   bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
870 
871   bool hasAtomicFaddInsts() const {
872     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
873   }
874 
875   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
876 
877   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
878 
879   bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
880     return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
881   }
882 
883   bool hasAtomicBufferGlobalPkAddF16Insts() const {
884     return HasAtomicBufferGlobalPkAddF16Insts;
885   }
886 
887   bool hasAtomicGlobalPkAddBF16Inst() const {
888     return HasAtomicGlobalPkAddBF16Inst;
889   }
890 
891   bool hasAtomicBufferPkAddBF16Inst() const {
892     return HasAtomicBufferPkAddBF16Inst;
893   }
894 
895   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
896 
897   /// \return true if the target has flat, global, and buffer atomic fadd for
898   /// double.
899   bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
900     return HasFlatBufferGlobalAtomicFaddF64Inst;
901   }
902 
903   /// \return true if the target's flat, global, and buffer atomic fadd for
904   /// float supports denormal handling.
905   bool hasMemoryAtomicFaddF32DenormalSupport() const {
906     return HasMemoryAtomicFaddF32DenormalSupport;
907   }
908 
909   /// \return true if atomic operations targeting fine-grained memory work
910   /// correctly at device scope, in allocations in host or peer PCIe device
911   /// memory.
912   bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
913     return HasAgentScopeFineGrainedRemoteMemoryAtomics;
914   }
915 
916   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
917 
918   bool hasDefaultComponentBroadcast() const {
919     return HasDefaultComponentBroadcast;
920   }
921 
922   bool hasNoSdstCMPX() const {
923     return HasNoSdstCMPX;
924   }
925 
926   bool hasVscnt() const {
927     return HasVscnt;
928   }
929 
930   bool hasGetWaveIdInst() const {
931     return HasGetWaveIdInst;
932   }
933 
934   bool hasSMemTimeInst() const {
935     return HasSMemTimeInst;
936   }
937 
938   bool hasShaderCyclesRegister() const {
939     return HasShaderCyclesRegister;
940   }
941 
942   bool hasShaderCyclesHiLoRegisters() const {
943     return HasShaderCyclesHiLoRegisters;
944   }
945 
946   bool hasVOP3Literal() const {
947     return HasVOP3Literal;
948   }
949 
950   bool hasNoDataDepHazard() const {
951     return HasNoDataDepHazard;
952   }
953 
954   bool vmemWriteNeedsExpWaitcnt() const {
955     return getGeneration() < SEA_ISLANDS;
956   }
957 
958   bool hasInstPrefetch() const {
959     return getGeneration() == GFX10 || getGeneration() == GFX11;
960   }
961 
962   bool hasPrefetch() const { return GFX12Insts; }
963 
964   // Has s_cmpk_* instructions.
965   bool hasSCmpK() const { return getGeneration() < GFX12; }
966 
967   // Scratch is allocated in 256 dword per wave blocks for the entire
968   // wavefront. When viewed from the perspective of an arbitrary workitem, this
969   // is 4-byte aligned.
970   //
971   // Only 4-byte alignment is really needed to access anything. Transformations
972   // on the pointer value itself may rely on the alignment / known low bits of
973   // the pointer. Set this to something above the minimum to avoid needing
974   // dynamic realignment in common cases.
975   Align getStackAlignment() const { return Align(16); }
976 
977   bool enableMachineScheduler() const override {
978     return true;
979   }
980 
981   bool useAA() const override;
982 
983   bool enableSubRegLiveness() const override {
984     return true;
985   }
986 
987   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
988   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
989 
990   // static wrappers
991   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
992 
993   // XXX - Why is this here if it isn't in the default pass set?
994   bool enableEarlyIfConversion() const override {
995     return true;
996   }
997 
998   void overrideSchedPolicy(MachineSchedPolicy &Policy,
999                            unsigned NumRegionInstrs) const override;
1000 
1001   void mirFileLoaded(MachineFunction &MF) const override;
1002 
1003   unsigned getMaxNumUserSGPRs() const {
1004     return AMDGPU::getMaxNumUserSGPRs(*this);
1005   }
1006 
1007   bool hasSMemRealTime() const {
1008     return HasSMemRealTime;
1009   }
1010 
1011   bool hasMovrel() const {
1012     return HasMovrel;
1013   }
1014 
1015   bool hasVGPRIndexMode() const {
1016     return HasVGPRIndexMode;
1017   }
1018 
1019   bool useVGPRIndexMode() const;
1020 
1021   bool hasScalarCompareEq64() const {
1022     return getGeneration() >= VOLCANIC_ISLANDS;
1023   }
1024 
1025   bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
1026 
1027   bool hasScalarStores() const {
1028     return HasScalarStores;
1029   }
1030 
1031   bool hasScalarAtomics() const {
1032     return HasScalarAtomics;
1033   }
1034 
1035   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
1036   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
1037 
1038   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
1039   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
1040 
1041   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
1042   bool hasPermLane64() const { return getGeneration() >= GFX11; }
1043 
1044   bool hasDPP() const {
1045     return HasDPP;
1046   }
1047 
1048   bool hasDPPBroadcasts() const {
1049     return HasDPP && getGeneration() < GFX10;
1050   }
1051 
1052   bool hasDPPWavefrontShifts() const {
1053     return HasDPP && getGeneration() < GFX10;
1054   }
1055 
1056   bool hasDPP8() const {
1057     return HasDPP8;
1058   }
1059 
1060   bool hasDPALU_DPP() const {
1061     return HasDPALU_DPP;
1062   }
1063 
1064   bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
1065 
1066   bool hasPackedFP32Ops() const {
1067     return HasPackedFP32Ops;
1068   }
1069 
1070   // Has V_PK_MOV_B32 opcode
1071   bool hasPkMovB32() const {
1072     return GFX90AInsts;
1073   }
1074 
1075   bool hasFmaakFmamkF32Insts() const {
1076     return getGeneration() >= GFX10 || hasGFX940Insts();
1077   }
1078 
1079   bool hasImageInsts() const {
1080     return HasImageInsts;
1081   }
1082 
1083   bool hasExtendedImageInsts() const {
1084     return HasExtendedImageInsts;
1085   }
1086 
1087   bool hasR128A16() const {
1088     return HasR128A16;
1089   }
1090 
1091   bool hasA16() const { return HasA16; }
1092 
1093   bool hasG16() const { return HasG16; }
1094 
1095   bool hasOffset3fBug() const {
1096     return HasOffset3fBug;
1097   }
1098 
1099   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1100 
1101   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1102 
1103   bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1104 
1105   bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1106 
1107   bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1108 
1109   bool hasNSAEncoding() const { return HasNSAEncoding; }
1110 
1111   bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1112 
1113   bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1114 
1115   unsigned getNSAMaxSize(bool HasSampler = false) const {
1116     return AMDGPU::getNSAMaxSize(*this, HasSampler);
1117   }
1118 
1119   bool hasGFX10_AEncoding() const {
1120     return GFX10_AEncoding;
1121   }
1122 
1123   bool hasGFX10_BEncoding() const {
1124     return GFX10_BEncoding;
1125   }
1126 
1127   bool hasGFX10_3Insts() const {
1128     return GFX10_3Insts;
1129   }
1130 
1131   bool hasMadF16() const; // Defined out of line.
1132 
1133   bool hasMovB64() const { return GFX940Insts; }
1134 
1135   bool hasLshlAddB64() const { return GFX940Insts; }
1136 
1137   bool enableSIScheduler() const {
1138     return EnableSIScheduler;
1139   }
1140 
1141   bool loadStoreOptEnabled() const {
1142     return EnableLoadStoreOpt;
1143   }
1144 
1145   bool hasSGPRInitBug() const {
1146     return SGPRInitBug;
1147   }
1148 
1149   bool hasUserSGPRInit16Bug() const {
1150     return UserSGPRInit16Bug && isWave32();
1151   }
1152 
1153   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1154 
1155   bool hasNegativeUnalignedScratchOffsetBug() const {
1156     return NegativeUnalignedScratchOffsetBug;
1157   }
1158 
1159   bool hasMFMAInlineLiteralBug() const {
1160     return HasMFMAInlineLiteralBug;
1161   }
1162 
1163   bool has12DWordStoreHazard() const {
1164     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1165   }
1166 
1167   // \returns true if the subtarget supports DWORDX3 load/store instructions.
1168   bool hasDwordx3LoadStores() const {
1169     return CIInsts;
1170   }
1171 
1172   bool hasReadM0MovRelInterpHazard() const {
1173     return getGeneration() == AMDGPUSubtarget::GFX9;
1174   }
1175 
1176   bool hasReadM0SendMsgHazard() const {
1177     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1178            getGeneration() <= AMDGPUSubtarget::GFX9;
1179   }
1180 
1181   bool hasReadM0LdsDmaHazard() const {
1182     return getGeneration() == AMDGPUSubtarget::GFX9;
1183   }
1184 
1185   bool hasReadM0LdsDirectHazard() const {
1186     return getGeneration() == AMDGPUSubtarget::GFX9;
1187   }
1188 
1189   bool hasVcmpxPermlaneHazard() const {
1190     return HasVcmpxPermlaneHazard;
1191   }
1192 
1193   bool hasVMEMtoScalarWriteHazard() const {
1194     return HasVMEMtoScalarWriteHazard;
1195   }
1196 
1197   bool hasSMEMtoVectorWriteHazard() const {
1198     return HasSMEMtoVectorWriteHazard;
1199   }
1200 
1201   bool hasLDSMisalignedBug() const {
1202     return LDSMisalignedBug && !EnableCuMode;
1203   }
1204 
1205   bool hasInstFwdPrefetchBug() const {
1206     return HasInstFwdPrefetchBug;
1207   }
1208 
1209   bool hasVcmpxExecWARHazard() const {
1210     return HasVcmpxExecWARHazard;
1211   }
1212 
1213   bool hasLdsBranchVmemWARHazard() const {
1214     return HasLdsBranchVmemWARHazard;
1215   }
1216 
1217   // Shift amount of a 64 bit shift cannot be a highest allocated register
1218   // if also at the end of the allocation block.
1219   bool hasShift64HighRegBug() const {
1220     return GFX90AInsts && !GFX940Insts;
1221   }
1222 
1223   // Has one cycle hazard on transcendental instruction feeding a
1224   // non transcendental VALU.
1225   bool hasTransForwardingHazard() const { return GFX940Insts; }
1226 
1227   // Has one cycle hazard on a VALU instruction partially writing dst with
1228   // a shift of result bits feeding another VALU instruction.
1229   bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1230 
1231   // Cannot use op_sel with v_dot instructions.
1232   bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1233 
1234   // Does not have HW interlocs for VALU writing and then reading SGPRs.
1235   bool hasVDecCoExecHazard() const {
1236     return GFX940Insts;
1237   }
1238 
1239   bool hasNSAtoVMEMBug() const {
1240     return HasNSAtoVMEMBug;
1241   }
1242 
1243   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1244 
1245   bool hasHardClauses() const { return MaxHardClauseLength > 0; }
1246 
1247   bool hasGFX90AInsts() const { return GFX90AInsts; }
1248 
1249   bool hasFPAtomicToDenormModeHazard() const {
1250     return getGeneration() == GFX10;
1251   }
1252 
1253   bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1254 
1255   bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1256 
1257   bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1258 
1259   bool hasVALUPartialForwardingHazard() const {
1260     return getGeneration() == GFX11;
1261   }
1262 
1263   bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1264 
1265   bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
1266 
1267   bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1268 
1269   bool requiresCodeObjectV6() const { return RequiresCOV6; }
1270 
1271   bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1272 
1273   bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
1274 
1275   /// Return if operations acting on VGPR tuples require even alignment.
1276   bool needsAlignedVGPRs() const { return GFX90AInsts; }
1277 
1278   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1279   bool hasSPackHL() const { return GFX11Insts; }
1280 
1281   /// Return true if the target's EXP instruction has the COMPR flag, which
1282   /// affects the meaning of the EN (enable) bits.
1283   bool hasCompressedExport() const { return !GFX11Insts; }
1284 
1285   /// Return true if the target's EXP instruction supports the NULL export
1286   /// target.
1287   bool hasNullExportTarget() const { return !GFX11Insts; }
1288 
1289   bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1290 
1291   bool hasVOPDInsts() const { return HasVOPDInsts; }
1292 
1293   bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1294 
1295   /// Return true if the target has the S_DELAY_ALU instruction.
1296   bool hasDelayAlu() const { return GFX11Insts; }
1297 
1298   bool hasPackedTID() const { return HasPackedTID; }
1299 
1300   // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1301   // hasGFX90AInsts is also true.
1302   bool hasGFX940Insts() const { return GFX940Insts; }
1303 
1304   // GFX950 is a derivation to GFX940. hasGFX950Insts() implies that
1305   // hasGFX940Insts and hasGFX90AInsts are also true.
1306   bool hasGFX950Insts() const { return GFX950Insts; }
1307 
1308   /// Returns true if the target supports
1309   /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
1310   /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
1311   bool hasLDSLoadB96_B128() const {
1312     return hasGFX950Insts();
1313   }
1314 
1315   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1316 
1317   bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1318 
1319   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1320 
1321   bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
1322 
1323   bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
1324 
1325   /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1326   /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1327   bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1328 
1329   /// \returns true if inline constants are not supported for F16 pseudo
1330   /// scalar transcendentals.
1331   bool hasNoF16PseudoScalarTransInlineConstants() const {
1332     return getGeneration() == GFX12;
1333   }
1334 
1335   /// \returns true if the target has instructions with xf32 format support.
1336   bool hasXF32Insts() const { return HasXF32Insts; }
1337 
1338   bool hasBitOp3Insts() const { return HasBitOp3Insts; }
1339 
1340   bool hasPermlane16Swap() const { return HasPermlane16Swap; }
1341   bool hasPermlane32Swap() const { return HasPermlane32Swap; }
1342   bool hasAshrPkInsts() const { return HasAshrPkInsts; }
1343 
1344   bool hasMinimum3Maximum3F32() const {
1345     return HasMinimum3Maximum3F32;
1346   }
1347 
1348   bool hasMinimum3Maximum3F16() const {
1349     return HasMinimum3Maximum3F16;
1350   }
1351 
1352   bool hasMinimum3Maximum3PKF16() const {
1353     return HasMinimum3Maximum3PKF16;
1354   }
1355 
1356   /// \returns The maximum number of instructions that can be enclosed in an
1357   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1358   /// instruction.
1359   unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1360 
1361   bool hasPrngInst() const { return HasPrngInst; }
1362 
1363   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1364   /// SGPRs
1365   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1366 
1367   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1368   /// VGPRs
1369   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1370 
1371   /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
1372   /// be achieved when the only function running on a CU is \p F, each workgroup
1373   /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
1374   /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
1375   /// range, so this returns a range as well.
1376   ///
1377   /// Note that occupancy can be affected by the scratch allocation as well, but
1378   /// we do not have enough information to compute it.
1379   std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
1380                                                  unsigned LDSSize = 0,
1381                                                  unsigned NumSGPRs = 0,
1382                                                  unsigned NumVGPRs = 0) const;
1383 
1384   /// \returns true if the flat_scratch register should be initialized with the
1385   /// pointer to the wave's scratch memory rather than a size and offset.
1386   bool flatScratchIsPointer() const {
1387     return getGeneration() >= AMDGPUSubtarget::GFX9;
1388   }
1389 
1390   /// \returns true if the flat_scratch register is initialized by the HW.
1391   /// In this case it is readonly.
1392   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1393 
1394   /// \returns true if the architected SGPRs are enabled.
1395   bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1396 
1397   /// \returns true if Global Data Share is supported.
1398   bool hasGDS() const { return HasGDS; }
1399 
1400   /// \returns true if Global Wave Sync is supported.
1401   bool hasGWS() const { return HasGWS; }
1402 
1403   /// \returns true if the machine has merged shaders in which s0-s7 are
1404   /// reserved by the hardware and user SGPRs start at s8
1405   bool hasMergedShaders() const {
1406     return getGeneration() >= GFX9;
1407   }
1408 
1409   // \returns true if the target supports the pre-NGG legacy geometry path.
1410   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1411 
1412   // \returns true if preloading kernel arguments is supported.
1413   bool hasKernargPreload() const { return KernargPreload; }
1414 
1415   // \returns true if the target has split barriers feature
1416   bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1417 
1418   // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1419   bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }
1420 
1421   // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1422   // no-return form.
1423   bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1424 
1425   // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1426   bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1427 
1428   // \returns true if the target has IEEE kernel descriptor mode bit
1429   bool hasIEEEMode() const { return getGeneration() < GFX12; }
1430 
1431   // \returns true if the target has IEEE fminimum/fmaximum instructions
1432   bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1433 
1434   // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
1435   bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
1436 
1437   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1438   bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1439 
1440   /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1441   /// values.
1442   bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1443 
1444   // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1445   // of sign-extending.
1446   bool hasGetPCZeroExtension() const { return GFX12Insts; }
1447 
1448   /// \returns SGPR allocation granularity supported by the subtarget.
1449   unsigned getSGPRAllocGranule() const {
1450     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1451   }
1452 
1453   /// \returns SGPR encoding granularity supported by the subtarget.
1454   unsigned getSGPREncodingGranule() const {
1455     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1456   }
1457 
1458   /// \returns Total number of SGPRs supported by the subtarget.
1459   unsigned getTotalNumSGPRs() const {
1460     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1461   }
1462 
1463   /// \returns Addressable number of SGPRs supported by the subtarget.
1464   unsigned getAddressableNumSGPRs() const {
1465     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1466   }
1467 
1468   /// \returns Minimum number of SGPRs that meets the given number of waves per
1469   /// execution unit requirement supported by the subtarget.
1470   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1471     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1472   }
1473 
1474   /// \returns Maximum number of SGPRs that meets the given number of waves per
1475   /// execution unit requirement supported by the subtarget.
1476   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1477     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1478   }
1479 
1480   /// \returns Reserved number of SGPRs. This is common
1481   /// utility function called by MachineFunction and
1482   /// Function variants of getReservedNumSGPRs.
1483   unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1484   /// \returns Reserved number of SGPRs for given machine function \p MF.
1485   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1486 
1487   /// \returns Reserved number of SGPRs for given function \p F.
1488   unsigned getReservedNumSGPRs(const Function &F) const;
1489 
1490   /// \returns max num SGPRs. This is the common utility
1491   /// function called by MachineFunction and Function
1492   /// variants of getMaxNumSGPRs.
1493   unsigned getBaseMaxNumSGPRs(const Function &F,
1494                               std::pair<unsigned, unsigned> WavesPerEU,
1495                               unsigned PreloadedSGPRs,
1496                               unsigned ReservedNumSGPRs) const;
1497 
1498   /// \returns Maximum number of SGPRs that meets number of waves per execution
1499   /// unit requirement for function \p MF, or number of SGPRs explicitly
1500   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1501   ///
1502   /// \returns Value that meets number of waves per execution unit requirement
1503   /// if explicitly requested value cannot be converted to integer, violates
1504   /// subtarget's specifications, or does not meet number of waves per execution
1505   /// unit requirement.
1506   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1507 
1508   /// \returns Maximum number of SGPRs that meets number of waves per execution
1509   /// unit requirement for function \p F, or number of SGPRs explicitly
1510   /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1511   ///
1512   /// \returns Value that meets number of waves per execution unit requirement
1513   /// if explicitly requested value cannot be converted to integer, violates
1514   /// subtarget's specifications, or does not meet number of waves per execution
1515   /// unit requirement.
1516   unsigned getMaxNumSGPRs(const Function &F) const;
1517 
1518   /// \returns VGPR allocation granularity supported by the subtarget.
1519   unsigned getVGPRAllocGranule() const {
1520     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1521   }
1522 
1523   /// \returns VGPR encoding granularity supported by the subtarget.
1524   unsigned getVGPREncodingGranule() const {
1525     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1526   }
1527 
1528   /// \returns Total number of VGPRs supported by the subtarget.
1529   unsigned getTotalNumVGPRs() const {
1530     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1531   }
1532 
1533   /// \returns Addressable number of architectural VGPRs supported by the
1534   /// subtarget.
1535   unsigned getAddressableNumArchVGPRs() const {
1536     return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
1537   }
1538 
1539   /// \returns Addressable number of VGPRs supported by the subtarget.
1540   unsigned getAddressableNumVGPRs() const {
1541     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1542   }
1543 
1544   /// \returns the minimum number of VGPRs that will prevent achieving more than
1545   /// the specified number of waves \p WavesPerEU.
1546   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1547     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1548   }
1549 
1550   /// \returns the maximum number of VGPRs that can be used and still achieved
1551   /// at least the specified number of waves \p WavesPerEU.
1552   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1553     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1554   }
1555 
1556   /// \returns max num VGPRs. This is the common utility function
1557   /// called by MachineFunction and Function variants of getMaxNumVGPRs.
1558   unsigned getBaseMaxNumVGPRs(const Function &F,
1559                               std::pair<unsigned, unsigned> WavesPerEU) const;
1560   /// \returns Maximum number of VGPRs that meets number of waves per execution
1561   /// unit requirement for function \p F, or number of VGPRs explicitly
1562   /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1563   ///
1564   /// \returns Value that meets number of waves per execution unit requirement
1565   /// if explicitly requested value cannot be converted to integer, violates
1566   /// subtarget's specifications, or does not meet number of waves per execution
1567   /// unit requirement.
1568   unsigned getMaxNumVGPRs(const Function &F) const;
1569 
1570   unsigned getMaxNumAGPRs(const Function &F) const {
1571     return getMaxNumVGPRs(F);
1572   }
1573 
1574   /// \returns Maximum number of VGPRs that meets number of waves per execution
1575   /// unit requirement for function \p MF, or number of VGPRs explicitly
1576   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1577   ///
1578   /// \returns Value that meets number of waves per execution unit requirement
1579   /// if explicitly requested value cannot be converted to integer, violates
1580   /// subtarget's specifications, or does not meet number of waves per execution
1581   /// unit requirement.
1582   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1583 
1584   bool isWave32() const {
1585     return getWavefrontSize() == 32;
1586   }
1587 
1588   bool isWave64() const {
1589     return getWavefrontSize() == 64;
1590   }
1591 
1592   /// Returns if the wavesize of this subtarget is known reliable. This is false
1593   /// only for the a default target-cpu that does not have an explicit
1594   /// +wavefrontsize target feature.
1595   bool isWaveSizeKnown() const {
1596     return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
1597            hasFeature(AMDGPU::FeatureWavefrontSize64);
1598   }
1599 
1600   const TargetRegisterClass *getBoolRC() const {
1601     return getRegisterInfo()->getBoolRC();
1602   }
1603 
1604   /// \returns Maximum number of work groups per compute unit supported by the
1605   /// subtarget and limited by given \p FlatWorkGroupSize.
1606   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1607     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1608   }
1609 
1610   /// \returns Minimum flat work group size supported by the subtarget.
1611   unsigned getMinFlatWorkGroupSize() const override {
1612     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1613   }
1614 
1615   /// \returns Maximum flat work group size supported by the subtarget.
1616   unsigned getMaxFlatWorkGroupSize() const override {
1617     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1618   }
1619 
1620   /// \returns Number of waves per execution unit required to support the given
1621   /// \p FlatWorkGroupSize.
1622   unsigned
1623   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1624     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1625   }
1626 
1627   /// \returns Minimum number of waves per execution unit supported by the
1628   /// subtarget.
1629   unsigned getMinWavesPerEU() const override {
1630     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1631   }
1632 
1633   void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1634                              SDep &Dep,
1635                              const TargetSchedModel *SchedModel) const override; // Overrides a base-class virtual; Dep is a mutable reference; defined out of line.
1636 
1637   // \returns true if it's beneficial on this subtarget for the scheduler to
1638   // cluster stores as well as loads.
1639   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1640 
1641   // \returns the number of address arguments from which to enable MIMG NSA
1642   // on supported architectures.
1643   unsigned getNSAThreshold(const MachineFunction &MF) const;
1644 
1645   // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1646   // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1647   bool requiresNopBeforeDeallocVGPRs() const {
1648     // Currently all targets that support the dealloc VGPRs message also require
1649     // the nop.
1650     return true;
1651   }
1652 
1653   bool requiresDisjointEarlyClobberAndUndef() const override {
1654     // AMDGPU doesn't care if early-clobber and undef operands are allocated
1655     // to the same register.
1656     return false;
1657   }
1658 };
1659 
1660 class GCNUserSGPRUsageInfo {
1661 public:
1662   bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1663 
1664   bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1665 
1666   bool hasDispatchPtr() const { return DispatchPtr; }
1667 
1668   bool hasQueuePtr() const { return QueuePtr; }
1669 
1670   bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1671 
1672   bool hasDispatchID() const { return DispatchID; }
1673 
1674   bool hasFlatScratchInit() const { return FlatScratchInit; }
1675 
1676   bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1677 
1678   unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1679 
1680   unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1681 
1682   unsigned getNumFreeUserSGPRs();
1683 
1684   void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1685 
1686   enum UserSGPRID : unsigned {
1687     ImplicitBufferPtrID = 0,
1688     PrivateSegmentBufferID = 1,
1689     DispatchPtrID = 2,
1690     QueuePtrID = 3,
1691     KernargSegmentPtrID = 4,
1692     DispatchIdID = 5,
1693     FlatScratchInitID = 6,
1694     PrivateSegmentSizeID = 7
1695   };
1696 
1697   // Returns the size in number of SGPRs for preload user SGPR field.
1698   static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1699     switch (ID) {
1700     case ImplicitBufferPtrID:
1701       return 2;
1702     case PrivateSegmentBufferID:
1703       return 4;
1704     case DispatchPtrID:
1705       return 2;
1706     case QueuePtrID:
1707       return 2;
1708     case KernargSegmentPtrID:
1709       return 2;
1710     case DispatchIdID:
1711       return 2;
1712     case FlatScratchInitID:
1713       return 2;
1714     case PrivateSegmentSizeID:
1715       return 1;
1716     }
1717     llvm_unreachable("Unknown UserSGPRID.");
1718   }
1719 
1720   GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1721 
1722 private:
1723   const GCNSubtarget &ST;
1724 
1725   // Private memory buffer
1726   // Compute directly in sgpr[0:1]
1727   // Other shaders indirect 64-bits at sgpr[0:1]
1728   bool ImplicitBufferPtr = false;
1729 
1730   bool PrivateSegmentBuffer = false;
1731 
1732   bool DispatchPtr = false;
1733 
1734   bool QueuePtr = false;
1735 
1736   bool KernargSegmentPtr = false;
1737 
1738   bool DispatchID = false;
1739 
1740   bool FlatScratchInit = false;
1741 
1742   bool PrivateSegmentSize = false;
1743 
1744   unsigned NumKernargPreloadSGPRs = 0;
1745 
1746   unsigned NumUsedUserSGPRs = 0;
1747 };
1748 
1749 } // end namespace llvm
1750 
1751 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1752