xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h (revision a7dea1671b87c07d2d266f836bfa8b58efc7c134)
1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16 
17 #include "AMDGPU.h"
18 #include "AMDGPUCallLowering.h"
19 #include "R600FrameLowering.h"
20 #include "R600ISelLowering.h"
21 #include "R600InstrInfo.h"
22 #include "SIFrameLowering.h"
23 #include "SIISelLowering.h"
24 #include "SIInstrInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/Triple.h"
27 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
29 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
30 #include "llvm/CodeGen/MachineFunction.h"
31 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
32 #include "llvm/MC/MCInstrItineraries.h"
33 #include "llvm/Support/MathExtras.h"
34 #include <cassert>
35 #include <cstdint>
36 #include <memory>
37 #include <utility>
38 
39 #define GET_SUBTARGETINFO_HEADER
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_HEADER
42 #include "R600GenSubtargetInfo.inc"
43 
44 namespace llvm {
45 
46 class StringRef;
47 
48 class AMDGPUSubtarget {
49 public:
50   enum Generation {
51     R600 = 0,
52     R700 = 1,
53     EVERGREEN = 2,
54     NORTHERN_ISLANDS = 3,
55     SOUTHERN_ISLANDS = 4,
56     SEA_ISLANDS = 5,
57     VOLCANIC_ISLANDS = 6,
58     GFX9 = 7,
59     GFX10 = 8
60   };
61 
62 private:
63   Triple TargetTriple;
64 
65 protected:
66   bool Has16BitInsts;
67   bool HasMadMixInsts;
68   bool FP32Denormals;
69   bool FPExceptions;
70   bool HasSDWA;
71   bool HasVOP3PInsts;
72   bool HasMulI24;
73   bool HasMulU24;
74   bool HasInv2PiInlineImm;
75   bool HasFminFmaxLegacy;
76   bool EnablePromoteAlloca;
77   bool HasTrigReducedRange;
78   unsigned MaxWavesPerEU;
79   int LocalMemorySize;
80   unsigned WavefrontSize;
81 
82 public:
83   AMDGPUSubtarget(const Triple &TT);
84 
85   static const AMDGPUSubtarget &get(const MachineFunction &MF);
86   static const AMDGPUSubtarget &get(const TargetMachine &TM,
87                                     const Function &F);
88 
89   /// \returns Default range flat work group size for a calling convention.
90   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
91 
92   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
93   /// for function \p F, or minimum/maximum flat work group sizes explicitly
94   /// requested using "amdgpu-flat-work-group-size" attribute attached to
95   /// function \p F.
96   ///
97   /// \returns Subtarget's default values if explicitly requested values cannot
98   /// be converted to integer, or violate subtarget's specifications.
99   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
100 
101   /// \returns Subtarget's default pair of minimum/maximum number of waves per
102   /// execution unit for function \p F, or minimum/maximum number of waves per
103   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
104   /// attached to function \p F.
105   ///
106   /// \returns Subtarget's default values if explicitly requested values cannot
107   /// be converted to integer, violate subtarget's specifications, or are not
108   /// compatible with minimum/maximum number of waves limited by flat work group
109   /// size, register usage, and/or lds usage.
110   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
111 
112   /// Return the amount of LDS that can be used that will not restrict the
113   /// occupancy lower than WaveCount.
114   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
115                                            const Function &) const;
116 
117   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
118   /// the given LDS memory size is the only constraint.
119   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
120 
121   unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
122 
123   bool isAmdHsaOS() const {
124     return TargetTriple.getOS() == Triple::AMDHSA;
125   }
126 
127   bool isAmdPalOS() const {
128     return TargetTriple.getOS() == Triple::AMDPAL;
129   }
130 
131   bool isMesa3DOS() const {
132     return TargetTriple.getOS() == Triple::Mesa3D;
133   }
134 
135   bool isMesaKernel(const Function &F) const {
136     return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
137   }
138 
139   bool isAmdHsaOrMesa(const Function &F) const {
140     return isAmdHsaOS() || isMesaKernel(F);
141   }
142 
143   bool has16BitInsts() const {
144     return Has16BitInsts;
145   }
146 
147   bool hasMadMixInsts() const {
148     return HasMadMixInsts;
149   }
150 
151   bool hasFP32Denormals() const {
152     return FP32Denormals;
153   }
154 
155   bool hasFPExceptions() const {
156     return FPExceptions;
157   }
158 
159   bool hasSDWA() const {
160     return HasSDWA;
161   }
162 
163   bool hasVOP3PInsts() const {
164     return HasVOP3PInsts;
165   }
166 
167   bool hasMulI24() const {
168     return HasMulI24;
169   }
170 
171   bool hasMulU24() const {
172     return HasMulU24;
173   }
174 
175   bool hasInv2PiInlineImm() const {
176     return HasInv2PiInlineImm;
177   }
178 
179   bool hasFminFmaxLegacy() const {
180     return HasFminFmaxLegacy;
181   }
182 
183   bool hasTrigReducedRange() const {
184     return HasTrigReducedRange;
185   }
186 
187   bool isPromoteAllocaEnabled() const {
188     return EnablePromoteAlloca;
189   }
190 
191   unsigned getWavefrontSize() const {
192     return WavefrontSize;
193   }
194 
195   int getLocalMemorySize() const {
196     return LocalMemorySize;
197   }
198 
199   Align getAlignmentForImplicitArgPtr() const {
200     return isAmdHsaOS() ? Align(8) : Align(4);
201   }
202 
203   /// Returns the offset in bytes from the start of the input buffer
204   ///        of the first explicit kernel argument.
205   unsigned getExplicitKernelArgOffset(const Function &F) const {
206     return isAmdHsaOrMesa(F) ? 0 : 36;
207   }
208 
209   /// \returns Maximum number of work groups per compute unit supported by the
210   /// subtarget and limited by given \p FlatWorkGroupSize.
211   virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
212 
213   /// \returns Minimum flat work group size supported by the subtarget.
214   virtual unsigned getMinFlatWorkGroupSize() const = 0;
215 
216   /// \returns Maximum flat work group size supported by the subtarget.
217   virtual unsigned getMaxFlatWorkGroupSize() const = 0;
218 
219   /// \returns Maximum number of waves per execution unit supported by the
220   /// subtarget and limited by given \p FlatWorkGroupSize.
221   virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const  = 0;
222 
223   /// \returns Minimum number of waves per execution unit supported by the
224   /// subtarget.
225   virtual unsigned getMinWavesPerEU() const = 0;
226 
227   /// \returns Maximum number of waves per execution unit supported by the
228   /// subtarget without any kind of limitation.
229   unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
230 
231   /// Creates value range metadata on an workitemid.* inrinsic call or load.
232   bool makeLIDRangeMetadata(Instruction *I) const;
233 
234   /// \returns Number of bytes of arguments that are passed to a shader or
235   /// kernel in addition to the explicit ones declared for the function.
236   unsigned getImplicitArgNumBytes(const Function &F) const {
237     if (isMesaKernel(F))
238       return 16;
239     return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
240   }
241   uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
242   unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
243 
244   virtual ~AMDGPUSubtarget() {}
245 };
246 
247 class GCNSubtarget : public AMDGPUGenSubtargetInfo,
248                      public AMDGPUSubtarget {
249 
250   using AMDGPUSubtarget::getMaxWavesPerEU;
251 
252 public:
253   enum TrapHandlerAbi {
254     TrapHandlerAbiNone = 0,
255     TrapHandlerAbiHsa = 1
256   };
257 
258   enum TrapID {
259     TrapIDHardwareReserved = 0,
260     TrapIDHSADebugTrap = 1,
261     TrapIDLLVMTrap = 2,
262     TrapIDLLVMDebugTrap = 3,
263     TrapIDDebugBreakpoint = 7,
264     TrapIDDebugReserved8 = 8,
265     TrapIDDebugReservedFE = 0xfe,
266     TrapIDDebugReservedFF = 0xff
267   };
268 
269   enum TrapRegValues {
270     LLVMTrapHandlerRegValue = 1
271   };
272 
273 private:
274   /// GlobalISel related APIs.
275   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
276   std::unique_ptr<InstructionSelector> InstSelector;
277   std::unique_ptr<LegalizerInfo> Legalizer;
278   std::unique_ptr<RegisterBankInfo> RegBankInfo;
279 
280 protected:
281   // Basic subtarget description.
282   Triple TargetTriple;
283   unsigned Gen;
284   InstrItineraryData InstrItins;
285   int LDSBankCount;
286   unsigned MaxPrivateElementSize;
287 
288   // Possibly statically set by tablegen, but may want to be overridden.
289   bool FastFMAF32;
290   bool HalfRate64Ops;
291 
292   // Dynamically set bits that enable features.
293   bool FP64FP16Denormals;
294   bool FlatForGlobal;
295   bool AutoWaitcntBeforeBarrier;
296   bool CodeObjectV3;
297   bool UnalignedScratchAccess;
298   bool UnalignedBufferAccess;
299   bool HasApertureRegs;
300   bool EnableXNACK;
301   bool DoesNotSupportXNACK;
302   bool EnableCuMode;
303   bool TrapHandler;
304 
305   // Used as options.
306   bool EnableLoadStoreOpt;
307   bool EnableUnsafeDSOffsetFolding;
308   bool EnableSIScheduler;
309   bool EnableDS128;
310   bool EnablePRTStrictNull;
311   bool DumpCode;
312 
313   // Static subtarget properties set by tablegen.
314   bool FP64;
315   bool FMA;
316   bool MIMG_R128;
317   bool IsGCN;
318   bool GCN3Encoding;
319   bool CIInsts;
320   bool GFX8Insts;
321   bool GFX9Insts;
322   bool GFX10Insts;
323   bool GFX7GFX8GFX9Insts;
324   bool SGPRInitBug;
325   bool HasSMemRealTime;
326   bool HasIntClamp;
327   bool HasFmaMixInsts;
328   bool HasMovrel;
329   bool HasVGPRIndexMode;
330   bool HasScalarStores;
331   bool HasScalarAtomics;
332   bool HasSDWAOmod;
333   bool HasSDWAScalar;
334   bool HasSDWASdst;
335   bool HasSDWAMac;
336   bool HasSDWAOutModsVOPC;
337   bool HasDPP;
338   bool HasDPP8;
339   bool HasR128A16;
340   bool HasNSAEncoding;
341   bool HasDLInsts;
342   bool HasDot1Insts;
343   bool HasDot2Insts;
344   bool HasDot3Insts;
345   bool HasDot4Insts;
346   bool HasDot5Insts;
347   bool HasDot6Insts;
348   bool HasMAIInsts;
349   bool HasPkFmacF16Inst;
350   bool HasAtomicFaddInsts;
351   bool EnableSRAMECC;
352   bool DoesNotSupportSRAMECC;
353   bool HasNoSdstCMPX;
354   bool HasVscnt;
355   bool HasRegisterBanking;
356   bool HasVOP3Literal;
357   bool HasNoDataDepHazard;
358   bool FlatAddressSpace;
359   bool FlatInstOffsets;
360   bool FlatGlobalInsts;
361   bool FlatScratchInsts;
362   bool ScalarFlatScratchInsts;
363   bool AddNoCarryInsts;
364   bool HasUnpackedD16VMem;
365   bool R600ALUInst;
366   bool CaymanISA;
367   bool CFALUBug;
368   bool LDSMisalignedBug;
369   bool HasMFMAInlineLiteralBug;
370   bool HasVertexCache;
371   short TexVTXClauseSize;
372   bool ScalarizeGlobal;
373 
374   bool HasVcmpxPermlaneHazard;
375   bool HasVMEMtoScalarWriteHazard;
376   bool HasSMEMtoVectorWriteHazard;
377   bool HasInstFwdPrefetchBug;
378   bool HasVcmpxExecWARHazard;
379   bool HasLdsBranchVmemWARHazard;
380   bool HasNSAtoVMEMBug;
381   bool HasOffset3fBug;
382   bool HasFlatSegmentOffsetBug;
383 
384   // Dummy feature to use for assembler in tablegen.
385   bool FeatureDisable;
386 
387   SelectionDAGTargetInfo TSInfo;
388 private:
389   SIInstrInfo InstrInfo;
390   SITargetLowering TLInfo;
391   SIFrameLowering FrameLowering;
392 
393   // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
394   static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
395 
396 public:
397   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
398                const GCNTargetMachine &TM);
399   ~GCNSubtarget() override;
400 
401   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
402                                                    StringRef GPU, StringRef FS);
403 
404   const SIInstrInfo *getInstrInfo() const override {
405     return &InstrInfo;
406   }
407 
408   const SIFrameLowering *getFrameLowering() const override {
409     return &FrameLowering;
410   }
411 
412   const SITargetLowering *getTargetLowering() const override {
413     return &TLInfo;
414   }
415 
416   const SIRegisterInfo *getRegisterInfo() const override {
417     return &InstrInfo.getRegisterInfo();
418   }
419 
420   const CallLowering *getCallLowering() const override {
421     return CallLoweringInfo.get();
422   }
423 
424   InstructionSelector *getInstructionSelector() const override {
425     return InstSelector.get();
426   }
427 
428   const LegalizerInfo *getLegalizerInfo() const override {
429     return Legalizer.get();
430   }
431 
432   const RegisterBankInfo *getRegBankInfo() const override {
433     return RegBankInfo.get();
434   }
435 
436   // Nothing implemented, just prevent crashes on use.
437   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
438     return &TSInfo;
439   }
440 
441   const InstrItineraryData *getInstrItineraryData() const override {
442     return &InstrItins;
443   }
444 
445   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
446 
447   Generation getGeneration() const {
448     return (Generation)Gen;
449   }
450 
451   unsigned getWavefrontSizeLog2() const {
452     return Log2_32(WavefrontSize);
453   }
454 
455   /// Return the number of high bits known to be zero fror a frame index.
456   unsigned getKnownHighZeroBitsForFrameIndex() const {
457     return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
458   }
459 
460   int getLDSBankCount() const {
461     return LDSBankCount;
462   }
463 
464   unsigned getMaxPrivateElementSize() const {
465     return MaxPrivateElementSize;
466   }
467 
468   unsigned getConstantBusLimit(unsigned Opcode) const;
469 
470   bool hasIntClamp() const {
471     return HasIntClamp;
472   }
473 
474   bool hasFP64() const {
475     return FP64;
476   }
477 
478   bool hasMIMG_R128() const {
479     return MIMG_R128;
480   }
481 
482   bool hasHWFP64() const {
483     return FP64;
484   }
485 
486   bool hasFastFMAF32() const {
487     return FastFMAF32;
488   }
489 
490   bool hasHalfRate64Ops() const {
491     return HalfRate64Ops;
492   }
493 
494   bool hasAddr64() const {
495     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
496   }
497 
498   // Return true if the target only has the reverse operand versions of VALU
499   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
500   bool hasOnlyRevVALUShifts() const {
501     return getGeneration() >= VOLCANIC_ISLANDS;
502   }
503 
504   bool hasBFE() const {
505     return true;
506   }
507 
508   bool hasBFI() const {
509     return true;
510   }
511 
512   bool hasBFM() const {
513     return hasBFE();
514   }
515 
516   bool hasBCNT(unsigned Size) const {
517     return true;
518   }
519 
520   bool hasFFBL() const {
521     return true;
522   }
523 
524   bool hasFFBH() const {
525     return true;
526   }
527 
528   bool hasMed3_16() const {
529     return getGeneration() >= AMDGPUSubtarget::GFX9;
530   }
531 
532   bool hasMin3Max3_16() const {
533     return getGeneration() >= AMDGPUSubtarget::GFX9;
534   }
535 
536   bool hasFmaMixInsts() const {
537     return HasFmaMixInsts;
538   }
539 
540   bool hasCARRY() const {
541     return true;
542   }
543 
544   bool hasFMA() const {
545     return FMA;
546   }
547 
548   bool hasSwap() const {
549     return GFX9Insts;
550   }
551 
552   bool hasScalarPackInsts() const {
553     return GFX9Insts;
554   }
555 
556   bool hasScalarMulHiInsts() const {
557     return GFX9Insts;
558   }
559 
560   TrapHandlerAbi getTrapHandlerAbi() const {
561     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
562   }
563 
564   /// True if the offset field of DS instructions works as expected. On SI, the
565   /// offset uses a 16-bit adder and does not always wrap properly.
566   bool hasUsableDSOffset() const {
567     return getGeneration() >= SEA_ISLANDS;
568   }
569 
570   bool unsafeDSOffsetFoldingEnabled() const {
571     return EnableUnsafeDSOffsetFolding;
572   }
573 
574   /// Condition output from div_scale is usable.
575   bool hasUsableDivScaleConditionOutput() const {
576     return getGeneration() != SOUTHERN_ISLANDS;
577   }
578 
579   /// Extra wait hazard is needed in some cases before
580   /// s_cbranch_vccnz/s_cbranch_vccz.
581   bool hasReadVCCZBug() const {
582     return getGeneration() <= SEA_ISLANDS;
583   }
584 
585   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
586   /// was written by a VALU instruction.
587   bool hasSMRDReadVALUDefHazard() const {
588     return getGeneration() == SOUTHERN_ISLANDS;
589   }
590 
591   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
592   /// SGPR was written by a VALU Instruction.
593   bool hasVMEMReadSGPRVALUDefHazard() const {
594     return getGeneration() >= VOLCANIC_ISLANDS;
595   }
596 
597   bool hasRFEHazards() const {
598     return getGeneration() >= VOLCANIC_ISLANDS;
599   }
600 
601   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
602   unsigned getSetRegWaitStates() const {
603     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
604   }
605 
606   bool dumpCode() const {
607     return DumpCode;
608   }
609 
610   /// Return the amount of LDS that can be used that will not restrict the
611   /// occupancy lower than WaveCount.
612   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
613                                            const Function &) const;
614 
615   bool hasFP16Denormals() const {
616     return FP64FP16Denormals;
617   }
618 
619   bool hasFP64Denormals() const {
620     return FP64FP16Denormals;
621   }
622 
623   bool supportsMinMaxDenormModes() const {
624     return getGeneration() >= AMDGPUSubtarget::GFX9;
625   }
626 
627   /// \returns If target supports S_DENORM_MODE.
628   bool hasDenormModeInst() const {
629     return getGeneration() >= AMDGPUSubtarget::GFX10;
630   }
631 
632   bool useFlatForGlobal() const {
633     return FlatForGlobal;
634   }
635 
636   /// \returns If target supports ds_read/write_b128 and user enables generation
637   /// of ds_read/write_b128.
638   bool useDS128() const {
639     return CIInsts && EnableDS128;
640   }
641 
642   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
643   bool haveRoundOpsF64() const {
644     return CIInsts;
645   }
646 
647   /// \returns If MUBUF instructions always perform range checking, even for
648   /// buffer resources used for private memory access.
649   bool privateMemoryResourceIsRangeChecked() const {
650     return getGeneration() < AMDGPUSubtarget::GFX9;
651   }
652 
653   /// \returns If target requires PRT Struct NULL support (zero result registers
654   /// for sparse texture support).
655   bool usePRTStrictNull() const {
656     return EnablePRTStrictNull;
657   }
658 
659   bool hasAutoWaitcntBeforeBarrier() const {
660     return AutoWaitcntBeforeBarrier;
661   }
662 
663   bool hasCodeObjectV3() const {
664     // FIXME: Need to add code object v3 support for mesa and pal.
665     return isAmdHsaOS() ? CodeObjectV3 : false;
666   }
667 
668   bool hasUnalignedBufferAccess() const {
669     return UnalignedBufferAccess;
670   }
671 
672   bool hasUnalignedScratchAccess() const {
673     return UnalignedScratchAccess;
674   }
675 
676   bool hasApertureRegs() const {
677     return HasApertureRegs;
678   }
679 
680   bool isTrapHandlerEnabled() const {
681     return TrapHandler;
682   }
683 
684   bool isXNACKEnabled() const {
685     return EnableXNACK;
686   }
687 
688   bool isCuModeEnabled() const {
689     return EnableCuMode;
690   }
691 
692   bool hasFlatAddressSpace() const {
693     return FlatAddressSpace;
694   }
695 
696   bool hasFlatScrRegister() const {
697     return hasFlatAddressSpace();
698   }
699 
700   bool hasFlatInstOffsets() const {
701     return FlatInstOffsets;
702   }
703 
704   bool hasFlatGlobalInsts() const {
705     return FlatGlobalInsts;
706   }
707 
708   bool hasFlatScratchInsts() const {
709     return FlatScratchInsts;
710   }
711 
712   bool hasScalarFlatScratchInsts() const {
713     return ScalarFlatScratchInsts;
714   }
715 
716   bool hasFlatSegmentOffsetBug() const {
717     return HasFlatSegmentOffsetBug;
718   }
719 
720   bool hasFlatLgkmVMemCountInOrder() const {
721     return getGeneration() > GFX9;
722   }
723 
724   bool hasD16LoadStore() const {
725     return getGeneration() >= GFX9;
726   }
727 
728   bool d16PreservesUnusedBits() const {
729     return hasD16LoadStore() && !isSRAMECCEnabled();
730   }
731 
732   bool hasD16Images() const {
733     return getGeneration() >= VOLCANIC_ISLANDS;
734   }
735 
736   /// Return if most LDS instructions have an m0 use that require m0 to be
737   /// iniitalized.
738   bool ldsRequiresM0Init() const {
739     return getGeneration() < GFX9;
740   }
741 
742   // True if the hardware rewinds and replays GWS operations if a wave is
743   // preempted.
744   //
745   // If this is false, a GWS operation requires testing if a nack set the
746   // MEM_VIOL bit, and repeating if so.
747   bool hasGWSAutoReplay() const {
748     return getGeneration() >= GFX9;
749   }
750 
751   /// \returns if target has ds_gws_sema_release_all instruction.
752   bool hasGWSSemaReleaseAll() const {
753     return CIInsts;
754   }
755 
756   bool hasAddNoCarry() const {
757     return AddNoCarryInsts;
758   }
759 
760   bool hasUnpackedD16VMem() const {
761     return HasUnpackedD16VMem;
762   }
763 
764   // Covers VS/PS/CS graphics shaders
765   bool isMesaGfxShader(const Function &F) const {
766     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
767   }
768 
769   bool hasMad64_32() const {
770     return getGeneration() >= SEA_ISLANDS;
771   }
772 
773   bool hasSDWAOmod() const {
774     return HasSDWAOmod;
775   }
776 
777   bool hasSDWAScalar() const {
778     return HasSDWAScalar;
779   }
780 
781   bool hasSDWASdst() const {
782     return HasSDWASdst;
783   }
784 
785   bool hasSDWAMac() const {
786     return HasSDWAMac;
787   }
788 
789   bool hasSDWAOutModsVOPC() const {
790     return HasSDWAOutModsVOPC;
791   }
792 
793   bool hasDLInsts() const {
794     return HasDLInsts;
795   }
796 
797   bool hasDot1Insts() const {
798     return HasDot1Insts;
799   }
800 
801   bool hasDot2Insts() const {
802     return HasDot2Insts;
803   }
804 
805   bool hasDot3Insts() const {
806     return HasDot3Insts;
807   }
808 
809   bool hasDot4Insts() const {
810     return HasDot4Insts;
811   }
812 
813   bool hasDot5Insts() const {
814     return HasDot5Insts;
815   }
816 
817   bool hasDot6Insts() const {
818     return HasDot6Insts;
819   }
820 
821   bool hasMAIInsts() const {
822     return HasMAIInsts;
823   }
824 
825   bool hasPkFmacF16Inst() const {
826     return HasPkFmacF16Inst;
827   }
828 
829   bool hasAtomicFaddInsts() const {
830     return HasAtomicFaddInsts;
831   }
832 
833   bool isSRAMECCEnabled() const {
834     return EnableSRAMECC;
835   }
836 
837   bool hasNoSdstCMPX() const {
838     return HasNoSdstCMPX;
839   }
840 
841   bool hasVscnt() const {
842     return HasVscnt;
843   }
844 
845   bool hasRegisterBanking() const {
846     return HasRegisterBanking;
847   }
848 
849   bool hasVOP3Literal() const {
850     return HasVOP3Literal;
851   }
852 
853   bool hasNoDataDepHazard() const {
854     return HasNoDataDepHazard;
855   }
856 
857   bool vmemWriteNeedsExpWaitcnt() const {
858     return getGeneration() < SEA_ISLANDS;
859   }
860 
861   // Scratch is allocated in 256 dword per wave blocks for the entire
862   // wavefront. When viewed from the perspecive of an arbitrary workitem, this
863   // is 4-byte aligned.
864   //
865   // Only 4-byte alignment is really needed to access anything. Transformations
866   // on the pointer value itself may rely on the alignment / known low bits of
867   // the pointer. Set this to something above the minimum to avoid needing
868   // dynamic realignment in common cases.
869   Align getStackAlignment() const { return Align(16); }
870 
871   bool enableMachineScheduler() const override {
872     return true;
873   }
874 
875   bool enableSubRegLiveness() const override {
876     return true;
877   }
878 
879   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
880   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
881 
882   /// \returns Number of execution units per compute unit supported by the
883   /// subtarget.
884   unsigned getEUsPerCU() const {
885     return AMDGPU::IsaInfo::getEUsPerCU(this);
886   }
887 
888   /// \returns Maximum number of waves per compute unit supported by the
889   /// subtarget without any kind of limitation.
890   unsigned getMaxWavesPerCU() const {
891     return AMDGPU::IsaInfo::getMaxWavesPerCU(this);
892   }
893 
894   /// \returns Maximum number of waves per compute unit supported by the
895   /// subtarget and limited by given \p FlatWorkGroupSize.
896   unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
897     return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
898   }
899 
900   /// \returns Number of waves per work group supported by the subtarget and
901   /// limited by given \p FlatWorkGroupSize.
902   unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
903     return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize);
904   }
905 
906   // static wrappers
907   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
908 
909   // XXX - Why is this here if it isn't in the default pass set?
910   bool enableEarlyIfConversion() const override {
911     return true;
912   }
913 
914   void overrideSchedPolicy(MachineSchedPolicy &Policy,
915                            unsigned NumRegionInstrs) const override;
916 
917   unsigned getMaxNumUserSGPRs() const {
918     return 16;
919   }
920 
921   bool hasSMemRealTime() const {
922     return HasSMemRealTime;
923   }
924 
925   bool hasMovrel() const {
926     return HasMovrel;
927   }
928 
929   bool hasVGPRIndexMode() const {
930     return HasVGPRIndexMode;
931   }
932 
933   bool useVGPRIndexMode(bool UserEnable) const {
934     return !hasMovrel() || (UserEnable && hasVGPRIndexMode());
935   }
936 
937   bool hasScalarCompareEq64() const {
938     return getGeneration() >= VOLCANIC_ISLANDS;
939   }
940 
941   bool hasScalarStores() const {
942     return HasScalarStores;
943   }
944 
945   bool hasScalarAtomics() const {
946     return HasScalarAtomics;
947   }
948 
949   bool hasLDSFPAtomics() const {
950     return GFX8Insts;
951   }
952 
953   bool hasDPP() const {
954     return HasDPP;
955   }
956 
957   bool hasDPPBroadcasts() const {
958     return HasDPP && getGeneration() < GFX10;
959   }
960 
961   bool hasDPPWavefrontShifts() const {
962     return HasDPP && getGeneration() < GFX10;
963   }
964 
965   bool hasDPP8() const {
966     return HasDPP8;
967   }
968 
969   bool hasR128A16() const {
970     return HasR128A16;
971   }
972 
973   bool hasOffset3fBug() const {
974     return HasOffset3fBug;
975   }
976 
977   bool hasNSAEncoding() const {
978     return HasNSAEncoding;
979   }
980 
981   bool hasMadF16() const;
982 
983   bool enableSIScheduler() const {
984     return EnableSIScheduler;
985   }
986 
987   bool loadStoreOptEnabled() const {
988     return EnableLoadStoreOpt;
989   }
990 
991   bool hasSGPRInitBug() const {
992     return SGPRInitBug;
993   }
994 
995   bool hasMFMAInlineLiteralBug() const {
996     return HasMFMAInlineLiteralBug;
997   }
998 
999   bool has12DWordStoreHazard() const {
1000     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1001   }
1002 
1003   // \returns true if the subtarget supports DWORDX3 load/store instructions.
1004   bool hasDwordx3LoadStores() const {
1005     return CIInsts;
1006   }
1007 
1008   bool hasSMovFedHazard() const {
1009     return getGeneration() == AMDGPUSubtarget::GFX9;
1010   }
1011 
1012   bool hasReadM0MovRelInterpHazard() const {
1013     return getGeneration() == AMDGPUSubtarget::GFX9;
1014   }
1015 
1016   bool hasReadM0SendMsgHazard() const {
1017     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1018            getGeneration() <= AMDGPUSubtarget::GFX9;
1019   }
1020 
1021   bool hasVcmpxPermlaneHazard() const {
1022     return HasVcmpxPermlaneHazard;
1023   }
1024 
1025   bool hasVMEMtoScalarWriteHazard() const {
1026     return HasVMEMtoScalarWriteHazard;
1027   }
1028 
1029   bool hasSMEMtoVectorWriteHazard() const {
1030     return HasSMEMtoVectorWriteHazard;
1031   }
1032 
1033   bool hasLDSMisalignedBug() const {
1034     return LDSMisalignedBug && !EnableCuMode;
1035   }
1036 
1037   bool hasInstFwdPrefetchBug() const {
1038     return HasInstFwdPrefetchBug;
1039   }
1040 
1041   bool hasVcmpxExecWARHazard() const {
1042     return HasVcmpxExecWARHazard;
1043   }
1044 
1045   bool hasLdsBranchVmemWARHazard() const {
1046     return HasLdsBranchVmemWARHazard;
1047   }
1048 
1049   bool hasNSAtoVMEMBug() const {
1050     return HasNSAtoVMEMBug;
1051   }
1052 
1053   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1054   /// SGPRs
1055   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1056 
1057   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1058   /// VGPRs
1059   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1060 
1061   /// Return occupancy for the given function. Used LDS and a number of
1062   /// registers if provided.
1063   /// Note, occupancy can be affected by the scratch allocation as well, but
1064   /// we do not have enough information to compute it.
1065   unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0,
1066                             unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1067 
1068   /// \returns true if the flat_scratch register should be initialized with the
1069   /// pointer to the wave's scratch memory rather than a size and offset.
1070   bool flatScratchIsPointer() const {
1071     return getGeneration() >= AMDGPUSubtarget::GFX9;
1072   }
1073 
1074   /// \returns true if the machine has merged shaders in which s0-s7 are
1075   /// reserved by the hardware and user SGPRs start at s8
1076   bool hasMergedShaders() const {
1077     return getGeneration() >= GFX9;
1078   }
1079 
1080   /// \returns SGPR allocation granularity supported by the subtarget.
1081   unsigned getSGPRAllocGranule() const {
1082     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1083   }
1084 
1085   /// \returns SGPR encoding granularity supported by the subtarget.
1086   unsigned getSGPREncodingGranule() const {
1087     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1088   }
1089 
1090   /// \returns Total number of SGPRs supported by the subtarget.
1091   unsigned getTotalNumSGPRs() const {
1092     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1093   }
1094 
1095   /// \returns Addressable number of SGPRs supported by the subtarget.
1096   unsigned getAddressableNumSGPRs() const {
1097     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1098   }
1099 
1100   /// \returns Minimum number of SGPRs that meets the given number of waves per
1101   /// execution unit requirement supported by the subtarget.
1102   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1103     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1104   }
1105 
1106   /// \returns Maximum number of SGPRs that meets the given number of waves per
1107   /// execution unit requirement supported by the subtarget.
1108   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1109     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1110   }
1111 
1112   /// \returns Reserved number of SGPRs for given function \p MF.
1113   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1114 
1115   /// \returns Maximum number of SGPRs that meets number of waves per execution
1116   /// unit requirement for function \p MF, or number of SGPRs explicitly
1117   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1118   ///
1119   /// \returns Value that meets number of waves per execution unit requirement
1120   /// if explicitly requested value cannot be converted to integer, violates
1121   /// subtarget's specifications, or does not meet number of waves per execution
1122   /// unit requirement.
1123   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1124 
1125   /// \returns VGPR allocation granularity supported by the subtarget.
1126   unsigned getVGPRAllocGranule() const {
1127     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1128   }
1129 
1130   /// \returns VGPR encoding granularity supported by the subtarget.
1131   unsigned getVGPREncodingGranule() const {
1132     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1133   }
1134 
1135   /// \returns Total number of VGPRs supported by the subtarget.
1136   unsigned getTotalNumVGPRs() const {
1137     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1138   }
1139 
1140   /// \returns Addressable number of VGPRs supported by the subtarget.
1141   unsigned getAddressableNumVGPRs() const {
1142     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1143   }
1144 
1145   /// \returns Minimum number of VGPRs that meets given number of waves per
1146   /// execution unit requirement supported by the subtarget.
1147   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1148     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1149   }
1150 
1151   /// \returns Maximum number of VGPRs that meets given number of waves per
1152   /// execution unit requirement supported by the subtarget.
1153   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1154     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1155   }
1156 
1157   /// \returns Maximum number of VGPRs that meets number of waves per execution
1158   /// unit requirement for function \p MF, or number of VGPRs explicitly
1159   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1160   ///
1161   /// \returns Value that meets number of waves per execution unit requirement
1162   /// if explicitly requested value cannot be converted to integer, violates
1163   /// subtarget's specifications, or does not meet number of waves per execution
1164   /// unit requirement.
1165   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1166 
1167   void getPostRAMutations(
1168       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1169       const override;
1170 
1171   bool isWave32() const {
1172     return WavefrontSize == 32;
1173   }
1174 
1175   const TargetRegisterClass *getBoolRC() const {
1176     return getRegisterInfo()->getBoolRC();
1177   }
1178 
1179   /// \returns Maximum number of work groups per compute unit supported by the
1180   /// subtarget and limited by given \p FlatWorkGroupSize.
1181   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1182     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1183   }
1184 
1185   /// \returns Minimum flat work group size supported by the subtarget.
1186   unsigned getMinFlatWorkGroupSize() const override {
1187     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1188   }
1189 
1190   /// \returns Maximum flat work group size supported by the subtarget.
1191   unsigned getMaxFlatWorkGroupSize() const override {
1192     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1193   }
1194 
1195   /// \returns Maximum number of waves per execution unit supported by the
1196   /// subtarget and limited by given \p FlatWorkGroupSize.
1197   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
1198     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
1199   }
1200 
1201   /// \returns Minimum number of waves per execution unit supported by the
1202   /// subtarget.
1203   unsigned getMinWavesPerEU() const override {
1204     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1205   }
1206 };
1207 
1208 class R600Subtarget final : public R600GenSubtargetInfo,
1209                             public AMDGPUSubtarget {
1210 private:
1211   R600InstrInfo InstrInfo;
1212   R600FrameLowering FrameLowering;
1213   bool FMA;
1214   bool CaymanISA;
1215   bool CFALUBug;
1216   bool HasVertexCache;
1217   bool R600ALUInst;
1218   bool FP64;
1219   short TexVTXClauseSize;
1220   Generation Gen;
1221   R600TargetLowering TLInfo;
1222   InstrItineraryData InstrItins;
1223   SelectionDAGTargetInfo TSInfo;
1224 
1225 public:
1226   R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
1227                 const TargetMachine &TM);
1228 
1229   const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
1230 
1231   const R600FrameLowering *getFrameLowering() const override {
1232     return &FrameLowering;
1233   }
1234 
1235   const R600TargetLowering *getTargetLowering() const override {
1236     return &TLInfo;
1237   }
1238 
1239   const R600RegisterInfo *getRegisterInfo() const override {
1240     return &InstrInfo.getRegisterInfo();
1241   }
1242 
1243   const InstrItineraryData *getInstrItineraryData() const override {
1244     return &InstrItins;
1245   }
1246 
1247   // Nothing implemented, just prevent crashes on use.
1248   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
1249     return &TSInfo;
1250   }
1251 
1252   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
1253 
1254   Generation getGeneration() const {
1255     return Gen;
1256   }
1257 
1258   Align getStackAlignment() const { return Align(4); }
1259 
1260   R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
1261                                                  StringRef GPU, StringRef FS);
1262 
1263   bool hasBFE() const {
1264     return (getGeneration() >= EVERGREEN);
1265   }
1266 
1267   bool hasBFI() const {
1268     return (getGeneration() >= EVERGREEN);
1269   }
1270 
1271   bool hasBCNT(unsigned Size) const {
1272     if (Size == 32)
1273       return (getGeneration() >= EVERGREEN);
1274 
1275     return false;
1276   }
1277 
1278   bool hasBORROW() const {
1279     return (getGeneration() >= EVERGREEN);
1280   }
1281 
1282   bool hasCARRY() const {
1283     return (getGeneration() >= EVERGREEN);
1284   }
1285 
1286   bool hasCaymanISA() const {
1287     return CaymanISA;
1288   }
1289 
1290   bool hasFFBL() const {
1291     return (getGeneration() >= EVERGREEN);
1292   }
1293 
1294   bool hasFFBH() const {
1295     return (getGeneration() >= EVERGREEN);
1296   }
1297 
1298   bool hasFMA() const { return FMA; }
1299 
1300   bool hasCFAluBug() const { return CFALUBug; }
1301 
1302   bool hasVertexCache() const { return HasVertexCache; }
1303 
1304   short getTexVTXClauseSize() const { return TexVTXClauseSize; }
1305 
1306   bool enableMachineScheduler() const override {
1307     return true;
1308   }
1309 
1310   bool enableSubRegLiveness() const override {
1311     return true;
1312   }
1313 
1314   /// \returns Maximum number of work groups per compute unit supported by the
1315   /// subtarget and limited by given \p FlatWorkGroupSize.
1316   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1317     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1318   }
1319 
1320   /// \returns Minimum flat work group size supported by the subtarget.
1321   unsigned getMinFlatWorkGroupSize() const override {
1322     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1323   }
1324 
1325   /// \returns Maximum flat work group size supported by the subtarget.
1326   unsigned getMaxFlatWorkGroupSize() const override {
1327     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1328   }
1329 
1330   /// \returns Maximum number of waves per execution unit supported by the
1331   /// subtarget and limited by given \p FlatWorkGroupSize.
1332   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
1333     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
1334   }
1335 
1336   /// \returns Minimum number of waves per execution unit supported by the
1337   /// subtarget.
1338   unsigned getMinWavesPerEU() const override {
1339     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1340   }
1341 };
1342 
1343 } // end namespace llvm
1344 
1345 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
1346