//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// AMD GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H

#include "AMDGPUCallLowering.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Support/ErrorHandling.h"

#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"

namespace llvm {

class GCNTargetMachine;

/// Subtarget description for AMD GCN targets. Combines the tablegen-generated
/// AMDGPUGenSubtargetInfo with the shared AMDGPUSubtarget base and holds the
/// instruction info, target lowering and frame lowering objects as members.
class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                           public AMDGPUSubtarget {
public:
  using AMDGPUSubtarget::getMaxWavesPerEU;

  // Following 2 enums are documented at:
  //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  enum class TrapHandlerAbi {
    NONE = 0x00,
    AMDHSA = 0x01,
  };

  enum class TrapID {
    LLVMAMDHSATrap = 0x02,
    LLVMAMDHSADebugTrap = 0x03,
  };

private:
  /// SelectionDAGISel related APIs.
  std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

  /// GlobalISel related APIs.
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;

protected:
  // Basic subtarget description.
  Triple TargetTriple;
  AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
  unsigned Gen = INVALID;
  InstrItineraryData InstrItins;
  int LDSBankCount = 0;
  unsigned MaxPrivateElementSize = 0;

  // Possibly statically set by tablegen, but may want to be overridden.
  bool FastDenormalF32 = false;
  bool HalfRate64Ops = false;
  bool FullRate64Ops = false;

  // Dynamically set bits that enable features.
  bool FlatForGlobal = false;
  bool AutoWaitcntBeforeBarrier = false;
  bool BackOffBarrier = false;
  bool UnalignedScratchAccess = false;
  bool UnalignedAccessMode = false;
  bool HasApertureRegs = false;
  bool SupportsXNACK = false;
  bool KernargPreload = false;

  // This should not be used directly. 'TargetID' tracks the dynamic settings
  // for XNACK.
  bool EnableXNACK = false;

  bool EnableTgSplit = false;
  bool EnableCuMode = false;
  bool TrapHandler = false;
  bool EnablePreciseMemory = false;

  // Used as options.
  bool EnableLoadStoreOpt = false;
  bool EnableUnsafeDSOffsetFolding = false;
  bool EnableSIScheduler = false;
  bool EnableDS128 = false;
  bool EnablePRTStrictNull = false;
  bool DumpCode = false;

  // Subtarget properties statically set by tablegen.
  bool FP64 = false;
  bool FMA = false;
  bool MIMG_R128 = false;
  bool CIInsts = false;
  bool GFX8Insts = false;
  bool GFX9Insts = false;
  bool GFX90AInsts = false;
  bool GFX940Insts = false;
  bool GFX950Insts = false;
  bool GFX10Insts = false;
  bool GFX11Insts = false;
  bool GFX12Insts = false;
  bool GFX10_3Insts = false;
  bool GFX7GFX8GFX9Insts = false;
  bool SGPRInitBug = false;
  bool UserSGPRInit16Bug = false;
  bool NegativeScratchOffsetBug = false;
  bool NegativeUnalignedScratchOffsetBug = false;
  bool HasSMemRealTime = false;
  bool HasIntClamp = false;
  bool HasFmaMixInsts = false;
  bool HasMovrel = false;
  bool HasVGPRIndexMode = false;
  bool HasScalarDwordx3Loads = false;
  bool HasScalarStores = false;
  bool HasScalarAtomics = false;
  bool HasSDWAOmod = false;
  bool HasSDWAScalar = false;
  bool HasSDWASdst = false;
  bool HasSDWAMac = false;
  bool HasSDWAOutModsVOPC = false;
  bool HasDPP = false;
  bool HasDPP8 = false;
  bool HasDPALU_DPP = false;
  bool HasDPPSrc1SGPR = false;
  bool HasPackedFP32Ops = false;
  bool HasImageInsts = false;
  bool HasExtendedImageInsts = false;
  bool HasR128A16 = false;
  bool HasA16 = false;
  bool HasG16 = false;
  bool HasNSAEncoding = false;
  bool HasPartialNSAEncoding = false;
  bool GFX10_AEncoding = false;
  bool GFX10_BEncoding = false;
  bool HasDLInsts = false;
  bool HasFmacF64Inst = false;
  bool HasDot1Insts = false;
  bool HasDot2Insts = false;
  bool HasDot3Insts = false;
  bool HasDot4Insts = false;
  bool HasDot5Insts = false;
  bool HasDot6Insts = false;
  bool HasDot7Insts = false;
  bool HasDot8Insts = false;
  bool HasDot9Insts = false;
  bool HasDot10Insts = false;
  bool HasDot11Insts = false;
  bool HasDot12Insts = false;
  bool HasDot13Insts = false;
  bool HasMAIInsts = false;
  bool HasFP8Insts = false;
  bool HasFP8ConversionInsts = false;
  bool HasCvtFP8Vop1Bug = false;
  bool HasPkFmacF16Inst = false;
  bool HasAtomicFMinFMaxF32GlobalInsts = false;
  bool HasAtomicFMinFMaxF64GlobalInsts = false;
  bool HasAtomicFMinFMaxF32FlatInsts = false;
  bool HasAtomicFMinFMaxF64FlatInsts = false;
  bool HasAtomicDsPkAdd16Insts = false;
  bool HasAtomicFlatPkAdd16Insts = false;
  bool HasAtomicFaddRtnInsts = false;
  bool HasAtomicFaddNoRtnInsts = false;
  bool HasMemoryAtomicFaddF32DenormalSupport = false;
  bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
  bool HasAtomicBufferGlobalPkAddF16Insts = false;
  bool HasAtomicCSubNoRtnInsts = false;
  bool HasAtomicGlobalPkAddBF16Inst = false;
  bool HasAtomicBufferPkAddBF16Inst = false;
  bool HasFlatAtomicFaddF32Inst = false;
  bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
  bool HasDefaultComponentZero = false;
  bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
  bool HasDefaultComponentBroadcast = false;
  bool HasXF32Insts = false;
  /// The maximum number of instructions that may be placed within an S_CLAUSE,
  /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
  /// indicates a lack of S_CLAUSE support.
  unsigned MaxHardClauseLength = 0;
  bool SupportsSRAMECC = false;

  // This should not be used directly. 'TargetID' tracks the dynamic settings
  // for SRAMECC.
  bool EnableSRAMECC = false;

  bool HasNoSdstCMPX = false;
  bool HasVscnt = false;
  bool HasGetWaveIdInst = false;
  bool HasSMemTimeInst = false;
  bool HasShaderCyclesRegister = false;
  bool HasShaderCyclesHiLoRegisters = false;
  bool HasVOP3Literal = false;
  bool HasNoDataDepHazard = false;
  bool FlatAddressSpace = false;
  bool FlatInstOffsets = false;
  bool FlatGlobalInsts = false;
  bool FlatScratchInsts = false;
  bool ScalarFlatScratchInsts = false;
  bool HasArchitectedFlatScratch = false;
  bool EnableFlatScratch = false;
  bool HasArchitectedSGPRs = false;
  bool HasGDS = false;
  bool HasGWS = false;
  bool AddNoCarryInsts = false;
  bool HasUnpackedD16VMem = false;
  bool LDSMisalignedBug = false;
  bool HasMFMAInlineLiteralBug = false;
  bool UnalignedBufferAccess = false;
  bool UnalignedDSAccess = false;
  bool HasPackedTID = false;
  bool ScalarizeGlobal = false;
  bool HasSALUFloatInsts = false;
  bool HasPseudoScalarTrans = false;
  bool HasRestrictedSOffset = false;
  bool HasBitOp3Insts = false;
  bool HasPrngInst = false;
  bool HasPermlane16Swap = false;
  bool HasPermlane32Swap = false;
  bool HasVcmpxPermlaneHazard = false;
  bool HasVMEMtoScalarWriteHazard = false;
  bool HasSMEMtoVectorWriteHazard = false;
  bool HasInstFwdPrefetchBug = false;
  bool HasVcmpxExecWARHazard = false;
  bool HasLdsBranchVmemWARHazard = false;
  bool HasNSAtoVMEMBug = false;
  bool HasNSAClauseBug = false;
  bool HasOffset3fBug = false;
  bool HasFlatSegmentOffsetBug = false;
  bool HasImageStoreD16Bug = false;
  bool HasImageGather4D16Bug = false;
  bool HasMSAALoadDstSelBug = false;
  bool HasPrivEnabledTrap2NopBug = false;
  bool Has1_5xVGPRs = false;
  bool HasMADIntraFwdBug = false;
  bool HasVOPDInsts = false;
  bool HasVALUTransUseHazard = false;
  bool HasForceStoreSC0SC1 = false;
  bool HasRequiredExportPriority = false;
  bool HasVmemWriteVgprInOrder = false;
  bool HasAshrPkInsts = false;
  bool HasMinimum3Maximum3F32 = false;
  bool HasMinimum3Maximum3F16 = false;
  bool HasMinimum3Maximum3PKF16 = false;

  bool RequiresCOV6 = false;

  // Dummy feature to use for assembler in tablegen.
  bool FeatureDisable = false;

private:
  SIInstrInfo InstrInfo;
  SITargetLowering TLInfo;
  SIFrameLowering FrameLowering;

public:
  GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
               const GCNTargetMachine &TM);
  ~GCNSubtarget() override;

  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
                                                StringRef GPU, StringRef FS);

  /// Diagnose inconsistent subtarget features before attempting to codegen
  /// function \p F.
277 void checkSubtargetFeatures(const Function &F) const; 278 279 const SIInstrInfo *getInstrInfo() const override { 280 return &InstrInfo; 281 } 282 283 const SIFrameLowering *getFrameLowering() const override { 284 return &FrameLowering; 285 } 286 287 const SITargetLowering *getTargetLowering() const override { 288 return &TLInfo; 289 } 290 291 const SIRegisterInfo *getRegisterInfo() const override { 292 return &InstrInfo.getRegisterInfo(); 293 } 294 295 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; 296 297 const CallLowering *getCallLowering() const override { 298 return CallLoweringInfo.get(); 299 } 300 301 const InlineAsmLowering *getInlineAsmLowering() const override { 302 return InlineAsmLoweringInfo.get(); 303 } 304 305 InstructionSelector *getInstructionSelector() const override { 306 return InstSelector.get(); 307 } 308 309 const LegalizerInfo *getLegalizerInfo() const override { 310 return Legalizer.get(); 311 } 312 313 const AMDGPURegisterBankInfo *getRegBankInfo() const override { 314 return RegBankInfo.get(); 315 } 316 317 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 318 return TargetID; 319 } 320 321 const InstrItineraryData *getInstrItineraryData() const override { 322 return &InstrItins; 323 } 324 325 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 326 327 Generation getGeneration() const { 328 return (Generation)Gen; 329 } 330 331 unsigned getMaxWaveScratchSize() const { 332 // See COMPUTE_TMPRING_SIZE.WAVESIZE. 333 if (getGeneration() >= GFX12) { 334 // 18-bit field in units of 64-dword. 335 return (64 * 4) * ((1 << 18) - 1); 336 } 337 if (getGeneration() == GFX11) { 338 // 15-bit field in units of 64-dword. 339 return (64 * 4) * ((1 << 15) - 1); 340 } 341 // 13-bit field in units of 256-dword. 342 return (256 * 4) * ((1 << 13) - 1); 343 } 344 345 /// Return the number of high bits known to be zero for a frame index. 
346 unsigned getKnownHighZeroBitsForFrameIndex() const { 347 return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); 348 } 349 350 int getLDSBankCount() const { 351 return LDSBankCount; 352 } 353 354 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 355 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 356 } 357 358 unsigned getConstantBusLimit(unsigned Opcode) const; 359 360 /// Returns if the result of this instruction with a 16-bit result returned in 361 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 362 /// the original value. 363 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 364 365 bool supportsWGP() const { return getGeneration() >= GFX10; } 366 367 bool hasIntClamp() const { 368 return HasIntClamp; 369 } 370 371 bool hasFP64() const { 372 return FP64; 373 } 374 375 bool hasMIMG_R128() const { 376 return MIMG_R128; 377 } 378 379 bool hasHWFP64() const { 380 return FP64; 381 } 382 383 bool hasHalfRate64Ops() const { 384 return HalfRate64Ops; 385 } 386 387 bool hasFullRate64Ops() const { 388 return FullRate64Ops; 389 } 390 391 bool hasAddr64() const { 392 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 393 } 394 395 bool hasFlat() const { 396 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 397 } 398 399 // Return true if the target only has the reverse operand versions of VALU 400 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
401 bool hasOnlyRevVALUShifts() const { 402 return getGeneration() >= VOLCANIC_ISLANDS; 403 } 404 405 bool hasFractBug() const { 406 return getGeneration() == SOUTHERN_ISLANDS; 407 } 408 409 bool hasBFE() const { 410 return true; 411 } 412 413 bool hasBFI() const { 414 return true; 415 } 416 417 bool hasBFM() const { 418 return hasBFE(); 419 } 420 421 bool hasBCNT(unsigned Size) const { 422 return true; 423 } 424 425 bool hasFFBL() const { 426 return true; 427 } 428 429 bool hasFFBH() const { 430 return true; 431 } 432 433 bool hasMed3_16() const { 434 return getGeneration() >= AMDGPUSubtarget::GFX9; 435 } 436 437 bool hasMin3Max3_16() const { 438 return getGeneration() >= AMDGPUSubtarget::GFX9; 439 } 440 441 bool hasFmaMixInsts() const { 442 return HasFmaMixInsts; 443 } 444 445 bool hasCARRY() const { 446 return true; 447 } 448 449 bool hasFMA() const { 450 return FMA; 451 } 452 453 bool hasSwap() const { 454 return GFX9Insts; 455 } 456 457 bool hasScalarPackInsts() const { 458 return GFX9Insts; 459 } 460 461 bool hasScalarMulHiInsts() const { 462 return GFX9Insts; 463 } 464 465 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; } 466 467 TrapHandlerAbi getTrapHandlerAbi() const { 468 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 469 } 470 471 bool supportsGetDoorbellID() const { 472 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 473 return getGeneration() >= GFX9; 474 } 475 476 /// True if the offset field of DS instructions works as expected. On SI, the 477 /// offset uses a 16-bit adder and does not always wrap properly. 478 bool hasUsableDSOffset() const { 479 return getGeneration() >= SEA_ISLANDS; 480 } 481 482 bool unsafeDSOffsetFoldingEnabled() const { 483 return EnableUnsafeDSOffsetFolding; 484 } 485 486 /// Condition output from div_scale is usable. 
487 bool hasUsableDivScaleConditionOutput() const { 488 return getGeneration() != SOUTHERN_ISLANDS; 489 } 490 491 /// Extra wait hazard is needed in some cases before 492 /// s_cbranch_vccnz/s_cbranch_vccz. 493 bool hasReadVCCZBug() const { 494 return getGeneration() <= SEA_ISLANDS; 495 } 496 497 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 498 bool partialVCCWritesUpdateVCCZ() const { 499 return getGeneration() >= GFX10; 500 } 501 502 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 503 /// was written by a VALU instruction. 504 bool hasSMRDReadVALUDefHazard() const { 505 return getGeneration() == SOUTHERN_ISLANDS; 506 } 507 508 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 509 /// SGPR was written by a VALU Instruction. 510 bool hasVMEMReadSGPRVALUDefHazard() const { 511 return getGeneration() >= VOLCANIC_ISLANDS; 512 } 513 514 bool hasRFEHazards() const { 515 return getGeneration() >= VOLCANIC_ISLANDS; 516 } 517 518 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 519 unsigned getSetRegWaitStates() const { 520 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 521 } 522 523 bool dumpCode() const { 524 return DumpCode; 525 } 526 527 /// Return the amount of LDS that can be used that will not restrict the 528 /// occupancy lower than WaveCount. 529 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 530 const Function &) const; 531 532 bool supportsMinMaxDenormModes() const { 533 return getGeneration() >= AMDGPUSubtarget::GFX9; 534 } 535 536 /// \returns If target supports S_DENORM_MODE. 537 bool hasDenormModeInst() const { 538 return getGeneration() >= AMDGPUSubtarget::GFX10; 539 } 540 541 bool useFlatForGlobal() const { 542 return FlatForGlobal; 543 } 544 545 /// \returns If target supports ds_read/write_b128 and user enables generation 546 /// of ds_read/write_b128. 
547 bool useDS128() const { 548 return CIInsts && EnableDS128; 549 } 550 551 /// \return If target supports ds_read/write_b96/128. 552 bool hasDS96AndDS128() const { 553 return CIInsts; 554 } 555 556 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 557 bool haveRoundOpsF64() const { 558 return CIInsts; 559 } 560 561 /// \returns If MUBUF instructions always perform range checking, even for 562 /// buffer resources used for private memory access. 563 bool privateMemoryResourceIsRangeChecked() const { 564 return getGeneration() < AMDGPUSubtarget::GFX9; 565 } 566 567 /// \returns If target requires PRT Struct NULL support (zero result registers 568 /// for sparse texture support). 569 bool usePRTStrictNull() const { 570 return EnablePRTStrictNull; 571 } 572 573 bool hasAutoWaitcntBeforeBarrier() const { 574 return AutoWaitcntBeforeBarrier; 575 } 576 577 /// \returns true if the target supports backing off of s_barrier instructions 578 /// when an exception is raised. 579 bool supportsBackOffBarrier() const { 580 return BackOffBarrier; 581 } 582 583 bool hasUnalignedBufferAccess() const { 584 return UnalignedBufferAccess; 585 } 586 587 bool hasUnalignedBufferAccessEnabled() const { 588 return UnalignedBufferAccess && UnalignedAccessMode; 589 } 590 591 bool hasUnalignedDSAccess() const { 592 return UnalignedDSAccess; 593 } 594 595 bool hasUnalignedDSAccessEnabled() const { 596 return UnalignedDSAccess && UnalignedAccessMode; 597 } 598 599 bool hasUnalignedScratchAccess() const { 600 return UnalignedScratchAccess; 601 } 602 603 bool hasUnalignedScratchAccessEnabled() const { 604 return UnalignedScratchAccess && UnalignedAccessMode; 605 } 606 607 bool hasUnalignedAccessMode() const { 608 return UnalignedAccessMode; 609 } 610 611 bool hasApertureRegs() const { 612 return HasApertureRegs; 613 } 614 615 bool isTrapHandlerEnabled() const { 616 return TrapHandler; 617 } 618 619 bool isXNACKEnabled() const { 620 return TargetID.isXnackOnOrAny(); 621 } 622 623 bool 
isTgSplitEnabled() const { 624 return EnableTgSplit; 625 } 626 627 bool isCuModeEnabled() const { 628 return EnableCuMode; 629 } 630 631 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } 632 633 bool hasFlatAddressSpace() const { 634 return FlatAddressSpace; 635 } 636 637 bool hasFlatScrRegister() const { 638 return hasFlatAddressSpace(); 639 } 640 641 bool hasFlatInstOffsets() const { 642 return FlatInstOffsets; 643 } 644 645 bool hasFlatGlobalInsts() const { 646 return FlatGlobalInsts; 647 } 648 649 bool hasFlatScratchInsts() const { 650 return FlatScratchInsts; 651 } 652 653 // Check if target supports ST addressing mode with FLAT scratch instructions. 654 // The ST addressing mode means no registers are used, either VGPR or SGPR, 655 // but only immediate offset is swizzled and added to the FLAT scratch base. 656 bool hasFlatScratchSTMode() const { 657 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); 658 } 659 660 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } 661 662 bool hasScalarFlatScratchInsts() const { 663 return ScalarFlatScratchInsts; 664 } 665 666 bool enableFlatScratch() const { 667 return flatScratchIsArchitected() || 668 (EnableFlatScratch && hasFlatScratchInsts()); 669 } 670 671 bool hasGlobalAddTidInsts() const { 672 return GFX10_BEncoding; 673 } 674 675 bool hasAtomicCSub() const { 676 return GFX10_BEncoding; 677 } 678 679 bool hasExportInsts() const { 680 return !hasGFX940Insts(); 681 } 682 683 bool hasVINTERPEncoding() const { 684 return GFX11Insts; 685 } 686 687 // DS_ADD_F64/DS_ADD_RTN_F64 688 bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); } 689 690 bool hasMultiDwordFlatScratchAddressing() const { 691 return getGeneration() >= GFX9; 692 } 693 694 bool hasFlatSegmentOffsetBug() const { 695 return HasFlatSegmentOffsetBug; 696 } 697 698 bool hasFlatLgkmVMemCountInOrder() const { 699 return getGeneration() > GFX9; 700 } 701 702 bool hasD16LoadStore() const { 703 
return getGeneration() >= GFX9; 704 } 705 706 bool d16PreservesUnusedBits() const { 707 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 708 } 709 710 bool hasD16Images() const { 711 return getGeneration() >= VOLCANIC_ISLANDS; 712 } 713 714 /// Return if most LDS instructions have an m0 use that require m0 to be 715 /// initialized. 716 bool ldsRequiresM0Init() const { 717 return getGeneration() < GFX9; 718 } 719 720 // True if the hardware rewinds and replays GWS operations if a wave is 721 // preempted. 722 // 723 // If this is false, a GWS operation requires testing if a nack set the 724 // MEM_VIOL bit, and repeating if so. 725 bool hasGWSAutoReplay() const { 726 return getGeneration() >= GFX9; 727 } 728 729 /// \returns if target has ds_gws_sema_release_all instruction. 730 bool hasGWSSemaReleaseAll() const { 731 return CIInsts; 732 } 733 734 /// \returns true if the target has integer add/sub instructions that do not 735 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 736 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 737 /// for saturation. 
738 bool hasAddNoCarry() const { 739 return AddNoCarryInsts; 740 } 741 742 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } 743 744 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; } 745 746 bool hasUnpackedD16VMem() const { 747 return HasUnpackedD16VMem; 748 } 749 750 // Covers VS/PS/CS graphics shaders 751 bool isMesaGfxShader(const Function &F) const { 752 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 753 } 754 755 bool hasMad64_32() const { 756 return getGeneration() >= SEA_ISLANDS; 757 } 758 759 bool hasSDWAOmod() const { 760 return HasSDWAOmod; 761 } 762 763 bool hasSDWAScalar() const { 764 return HasSDWAScalar; 765 } 766 767 bool hasSDWASdst() const { 768 return HasSDWASdst; 769 } 770 771 bool hasSDWAMac() const { 772 return HasSDWAMac; 773 } 774 775 bool hasSDWAOutModsVOPC() const { 776 return HasSDWAOutModsVOPC; 777 } 778 779 bool hasDLInsts() const { 780 return HasDLInsts; 781 } 782 783 bool hasFmacF64Inst() const { return HasFmacF64Inst; } 784 785 bool hasDot1Insts() const { 786 return HasDot1Insts; 787 } 788 789 bool hasDot2Insts() const { 790 return HasDot2Insts; 791 } 792 793 bool hasDot3Insts() const { 794 return HasDot3Insts; 795 } 796 797 bool hasDot4Insts() const { 798 return HasDot4Insts; 799 } 800 801 bool hasDot5Insts() const { 802 return HasDot5Insts; 803 } 804 805 bool hasDot6Insts() const { 806 return HasDot6Insts; 807 } 808 809 bool hasDot7Insts() const { 810 return HasDot7Insts; 811 } 812 813 bool hasDot8Insts() const { 814 return HasDot8Insts; 815 } 816 817 bool hasDot9Insts() const { 818 return HasDot9Insts; 819 } 820 821 bool hasDot10Insts() const { 822 return HasDot10Insts; 823 } 824 825 bool hasDot11Insts() const { 826 return HasDot11Insts; 827 } 828 829 bool hasDot12Insts() const { 830 return HasDot12Insts; 831 } 832 833 bool hasDot13Insts() const { 834 return HasDot13Insts; 835 } 836 837 bool hasMAIInsts() const { 838 return HasMAIInsts; 839 } 840 841 bool hasFP8Insts() const { 842 
return HasFP8Insts; 843 } 844 845 bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } 846 847 bool hasPkFmacF16Inst() const { 848 return HasPkFmacF16Inst; 849 } 850 851 bool hasAtomicFMinFMaxF32GlobalInsts() const { 852 return HasAtomicFMinFMaxF32GlobalInsts; 853 } 854 855 bool hasAtomicFMinFMaxF64GlobalInsts() const { 856 return HasAtomicFMinFMaxF64GlobalInsts; 857 } 858 859 bool hasAtomicFMinFMaxF32FlatInsts() const { 860 return HasAtomicFMinFMaxF32FlatInsts; 861 } 862 863 bool hasAtomicFMinFMaxF64FlatInsts() const { 864 return HasAtomicFMinFMaxF64FlatInsts; 865 } 866 867 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } 868 869 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } 870 871 bool hasAtomicFaddInsts() const { 872 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; 873 } 874 875 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } 876 877 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } 878 879 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const { 880 return HasAtomicBufferGlobalPkAddF16NoRtnInsts; 881 } 882 883 bool hasAtomicBufferGlobalPkAddF16Insts() const { 884 return HasAtomicBufferGlobalPkAddF16Insts; 885 } 886 887 bool hasAtomicGlobalPkAddBF16Inst() const { 888 return HasAtomicGlobalPkAddBF16Inst; 889 } 890 891 bool hasAtomicBufferPkAddBF16Inst() const { 892 return HasAtomicBufferPkAddBF16Inst; 893 } 894 895 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } 896 897 /// \return true if the target has flat, global, and buffer atomic fadd for 898 /// double. 899 bool hasFlatBufferGlobalAtomicFaddF64Inst() const { 900 return HasFlatBufferGlobalAtomicFaddF64Inst; 901 } 902 903 /// \return true if the target's flat, global, and buffer atomic fadd for 904 /// float supports denormal handling. 
905 bool hasMemoryAtomicFaddF32DenormalSupport() const { 906 return HasMemoryAtomicFaddF32DenormalSupport; 907 } 908 909 /// \return true if atomic operations targeting fine-grained memory work 910 /// correctly at device scope, in allocations in host or peer PCIe device 911 /// memory. 912 bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const { 913 return HasAgentScopeFineGrainedRemoteMemoryAtomics; 914 } 915 916 bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } 917 918 bool hasDefaultComponentBroadcast() const { 919 return HasDefaultComponentBroadcast; 920 } 921 922 bool hasNoSdstCMPX() const { 923 return HasNoSdstCMPX; 924 } 925 926 bool hasVscnt() const { 927 return HasVscnt; 928 } 929 930 bool hasGetWaveIdInst() const { 931 return HasGetWaveIdInst; 932 } 933 934 bool hasSMemTimeInst() const { 935 return HasSMemTimeInst; 936 } 937 938 bool hasShaderCyclesRegister() const { 939 return HasShaderCyclesRegister; 940 } 941 942 bool hasShaderCyclesHiLoRegisters() const { 943 return HasShaderCyclesHiLoRegisters; 944 } 945 946 bool hasVOP3Literal() const { 947 return HasVOP3Literal; 948 } 949 950 bool hasNoDataDepHazard() const { 951 return HasNoDataDepHazard; 952 } 953 954 bool vmemWriteNeedsExpWaitcnt() const { 955 return getGeneration() < SEA_ISLANDS; 956 } 957 958 bool hasInstPrefetch() const { 959 return getGeneration() == GFX10 || getGeneration() == GFX11; 960 } 961 962 bool hasPrefetch() const { return GFX12Insts; } 963 964 // Has s_cmpk_* instructions. 965 bool hasSCmpK() const { return getGeneration() < GFX12; } 966 967 // Scratch is allocated in 256 dword per wave blocks for the entire 968 // wavefront. When viewed from the perspective of an arbitrary workitem, this 969 // is 4-byte aligned. 970 // 971 // Only 4-byte alignment is really needed to access anything. Transformations 972 // on the pointer value itself may rely on the alignment / known low bits of 973 // the pointer. 
Set this to something above the minimum to avoid needing 974 // dynamic realignment in common cases. 975 Align getStackAlignment() const { return Align(16); } 976 977 bool enableMachineScheduler() const override { 978 return true; 979 } 980 981 bool useAA() const override; 982 983 bool enableSubRegLiveness() const override { 984 return true; 985 } 986 987 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 988 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 989 990 // static wrappers 991 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 992 993 // XXX - Why is this here if it isn't in the default pass set? 994 bool enableEarlyIfConversion() const override { 995 return true; 996 } 997 998 void overrideSchedPolicy(MachineSchedPolicy &Policy, 999 unsigned NumRegionInstrs) const override; 1000 1001 void mirFileLoaded(MachineFunction &MF) const override; 1002 1003 unsigned getMaxNumUserSGPRs() const { 1004 return AMDGPU::getMaxNumUserSGPRs(*this); 1005 } 1006 1007 bool hasSMemRealTime() const { 1008 return HasSMemRealTime; 1009 } 1010 1011 bool hasMovrel() const { 1012 return HasMovrel; 1013 } 1014 1015 bool hasVGPRIndexMode() const { 1016 return HasVGPRIndexMode; 1017 } 1018 1019 bool useVGPRIndexMode() const; 1020 1021 bool hasScalarCompareEq64() const { 1022 return getGeneration() >= VOLCANIC_ISLANDS; 1023 } 1024 1025 bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; } 1026 1027 bool hasScalarStores() const { 1028 return HasScalarStores; 1029 } 1030 1031 bool hasScalarAtomics() const { 1032 return HasScalarAtomics; 1033 } 1034 1035 bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } 1036 bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; } 1037 1038 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 1039 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 1040 1041 /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
1042 bool hasPermLane64() const { return getGeneration() >= GFX11; } 1043 1044 bool hasDPP() const { 1045 return HasDPP; 1046 } 1047 1048 bool hasDPPBroadcasts() const { 1049 return HasDPP && getGeneration() < GFX10; 1050 } 1051 1052 bool hasDPPWavefrontShifts() const { 1053 return HasDPP && getGeneration() < GFX10; 1054 } 1055 1056 bool hasDPP8() const { 1057 return HasDPP8; 1058 } 1059 1060 bool hasDPALU_DPP() const { 1061 return HasDPALU_DPP; 1062 } 1063 1064 bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; } 1065 1066 bool hasPackedFP32Ops() const { 1067 return HasPackedFP32Ops; 1068 } 1069 1070 // Has V_PK_MOV_B32 opcode 1071 bool hasPkMovB32() const { 1072 return GFX90AInsts; 1073 } 1074 1075 bool hasFmaakFmamkF32Insts() const { 1076 return getGeneration() >= GFX10 || hasGFX940Insts(); 1077 } 1078 1079 bool hasImageInsts() const { 1080 return HasImageInsts; 1081 } 1082 1083 bool hasExtendedImageInsts() const { 1084 return HasExtendedImageInsts; 1085 } 1086 1087 bool hasR128A16() const { 1088 return HasR128A16; 1089 } 1090 1091 bool hasA16() const { return HasA16; } 1092 1093 bool hasG16() const { return HasG16; } 1094 1095 bool hasOffset3fBug() const { 1096 return HasOffset3fBug; 1097 } 1098 1099 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 1100 1101 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 1102 1103 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } 1104 1105 bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; } 1106 1107 bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; } 1108 1109 bool hasNSAEncoding() const { return HasNSAEncoding; } 1110 1111 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; } 1112 1113 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } 1114 1115 unsigned getNSAMaxSize(bool HasSampler = false) const { 1116 return AMDGPU::getNSAMaxSize(*this, HasSampler); 1117 } 1118 1119 bool hasGFX10_AEncoding() const { 
1120 return GFX10_AEncoding; 1121 } 1122 1123 bool hasGFX10_BEncoding() const { 1124 return GFX10_BEncoding; 1125 } 1126 1127 bool hasGFX10_3Insts() const { 1128 return GFX10_3Insts; 1129 } 1130 1131 bool hasMadF16() const; 1132 1133 bool hasMovB64() const { return GFX940Insts; } 1134 1135 bool hasLshlAddB64() const { return GFX940Insts; } 1136 1137 bool enableSIScheduler() const { 1138 return EnableSIScheduler; 1139 } 1140 1141 bool loadStoreOptEnabled() const { 1142 return EnableLoadStoreOpt; 1143 } 1144 1145 bool hasSGPRInitBug() const { 1146 return SGPRInitBug; 1147 } 1148 1149 bool hasUserSGPRInit16Bug() const { 1150 return UserSGPRInit16Bug && isWave32(); 1151 } 1152 1153 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 1154 1155 bool hasNegativeUnalignedScratchOffsetBug() const { 1156 return NegativeUnalignedScratchOffsetBug; 1157 } 1158 1159 bool hasMFMAInlineLiteralBug() const { 1160 return HasMFMAInlineLiteralBug; 1161 } 1162 1163 bool has12DWordStoreHazard() const { 1164 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 1165 } 1166 1167 // \returns true if the subtarget supports DWORDX3 load/store instructions. 
1168 bool hasDwordx3LoadStores() const { 1169 return CIInsts; 1170 } 1171 1172 bool hasReadM0MovRelInterpHazard() const { 1173 return getGeneration() == AMDGPUSubtarget::GFX9; 1174 } 1175 1176 bool hasReadM0SendMsgHazard() const { 1177 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 1178 getGeneration() <= AMDGPUSubtarget::GFX9; 1179 } 1180 1181 bool hasReadM0LdsDmaHazard() const { 1182 return getGeneration() == AMDGPUSubtarget::GFX9; 1183 } 1184 1185 bool hasReadM0LdsDirectHazard() const { 1186 return getGeneration() == AMDGPUSubtarget::GFX9; 1187 } 1188 1189 bool hasVcmpxPermlaneHazard() const { 1190 return HasVcmpxPermlaneHazard; 1191 } 1192 1193 bool hasVMEMtoScalarWriteHazard() const { 1194 return HasVMEMtoScalarWriteHazard; 1195 } 1196 1197 bool hasSMEMtoVectorWriteHazard() const { 1198 return HasSMEMtoVectorWriteHazard; 1199 } 1200 1201 bool hasLDSMisalignedBug() const { 1202 return LDSMisalignedBug && !EnableCuMode; 1203 } 1204 1205 bool hasInstFwdPrefetchBug() const { 1206 return HasInstFwdPrefetchBug; 1207 } 1208 1209 bool hasVcmpxExecWARHazard() const { 1210 return HasVcmpxExecWARHazard; 1211 } 1212 1213 bool hasLdsBranchVmemWARHazard() const { 1214 return HasLdsBranchVmemWARHazard; 1215 } 1216 1217 // Shift amount of a 64 bit shift cannot be a highest allocated register 1218 // if also at the end of the allocation block. 1219 bool hasShift64HighRegBug() const { 1220 return GFX90AInsts && !GFX940Insts; 1221 } 1222 1223 // Has one cycle hazard on transcendental instruction feeding a 1224 // non transcendental VALU. 1225 bool hasTransForwardingHazard() const { return GFX940Insts; } 1226 1227 // Has one cycle hazard on a VALU instruction partially writing dst with 1228 // a shift of result bits feeding another VALU instruction. 1229 bool hasDstSelForwardingHazard() const { return GFX940Insts; } 1230 1231 // Cannot use op_sel with v_dot instructions. 
1232 bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } 1233 1234 // Does not have HW interlocs for VALU writing and then reading SGPRs. 1235 bool hasVDecCoExecHazard() const { 1236 return GFX940Insts; 1237 } 1238 1239 bool hasNSAtoVMEMBug() const { 1240 return HasNSAtoVMEMBug; 1241 } 1242 1243 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 1244 1245 bool hasHardClauses() const { return MaxHardClauseLength > 0; } 1246 1247 bool hasGFX90AInsts() const { return GFX90AInsts; } 1248 1249 bool hasFPAtomicToDenormModeHazard() const { 1250 return getGeneration() == GFX10; 1251 } 1252 1253 bool hasVOP3DPP() const { return getGeneration() >= GFX11; } 1254 1255 bool hasLdsDirect() const { return getGeneration() >= GFX11; } 1256 1257 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; } 1258 1259 bool hasVALUPartialForwardingHazard() const { 1260 return getGeneration() == GFX11; 1261 } 1262 1263 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } 1264 1265 bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } 1266 1267 bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } 1268 1269 bool requiresCodeObjectV6() const { return RequiresCOV6; } 1270 1271 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } 1272 1273 bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; } 1274 1275 /// Return if operations acting on VGPR tuples require even alignment. 1276 bool needsAlignedVGPRs() const { return GFX90AInsts; } 1277 1278 /// Return true if the target has the S_PACK_HL_B32_B16 instruction. 1279 bool hasSPackHL() const { return GFX11Insts; } 1280 1281 /// Return true if the target's EXP instruction has the COMPR flag, which 1282 /// affects the meaning of the EN (enable) bits. 1283 bool hasCompressedExport() const { return !GFX11Insts; } 1284 1285 /// Return true if the target's EXP instruction supports the NULL export 1286 /// target. 
1287 bool hasNullExportTarget() const { return !GFX11Insts; } 1288 1289 bool has1_5xVGPRs() const { return Has1_5xVGPRs; } 1290 1291 bool hasVOPDInsts() const { return HasVOPDInsts; } 1292 1293 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } 1294 1295 /// Return true if the target has the S_DELAY_ALU instruction. 1296 bool hasDelayAlu() const { return GFX11Insts; } 1297 1298 bool hasPackedTID() const { return HasPackedTID; } 1299 1300 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that 1301 // hasGFX90AInsts is also true. 1302 bool hasGFX940Insts() const { return GFX940Insts; } 1303 1304 // GFX950 is a derivation to GFX940. hasGFX950Insts() implies that 1305 // hasGFX940Insts and hasGFX90AInsts are also true. 1306 bool hasGFX950Insts() const { return GFX950Insts; } 1307 1308 /// Returns true if the target supports 1309 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or 1310 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit. 1311 bool hasLDSLoadB96_B128() const { 1312 return hasGFX950Insts(); 1313 } 1314 1315 bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } 1316 1317 bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } 1318 1319 bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } 1320 1321 bool hasRequiredExportPriority() const { return HasRequiredExportPriority; } 1322 1323 bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; } 1324 1325 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt 1326 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. 1327 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; } 1328 1329 /// \returns true if inline constants are not supported for F16 pseudo 1330 /// scalar transcendentals. 
1331 bool hasNoF16PseudoScalarTransInlineConstants() const { 1332 return getGeneration() == GFX12; 1333 } 1334 1335 /// \returns true if the target has instructions with xf32 format support. 1336 bool hasXF32Insts() const { return HasXF32Insts; } 1337 1338 bool hasBitOp3Insts() const { return HasBitOp3Insts; } 1339 1340 bool hasPermlane16Swap() const { return HasPermlane16Swap; } 1341 bool hasPermlane32Swap() const { return HasPermlane32Swap; } 1342 bool hasAshrPkInsts() const { return HasAshrPkInsts; } 1343 1344 bool hasMinimum3Maximum3F32() const { 1345 return HasMinimum3Maximum3F32; 1346 } 1347 1348 bool hasMinimum3Maximum3F16() const { 1349 return HasMinimum3Maximum3F16; 1350 } 1351 1352 bool hasMinimum3Maximum3PKF16() const { 1353 return HasMinimum3Maximum3PKF16; 1354 } 1355 1356 /// \returns The maximum number of instructions that can be enclosed in an 1357 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that 1358 /// instruction. 1359 unsigned maxHardClauseLength() const { return MaxHardClauseLength; } 1360 1361 bool hasPrngInst() const { return HasPrngInst; } 1362 1363 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1364 /// SGPRs 1365 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1366 1367 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1368 /// VGPRs 1369 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1370 1371 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can 1372 /// be achieved when the only function running on a CU is \p F, each workgroup 1373 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p 1374 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a 1375 /// range, so this returns a range as well. 1376 /// 1377 /// Note that occupancy can be affected by the scratch allocation as well, but 1378 /// we do not have enough information to compute it. 
1379 std::pair<unsigned, unsigned> computeOccupancy(const Function &F, 1380 unsigned LDSSize = 0, 1381 unsigned NumSGPRs = 0, 1382 unsigned NumVGPRs = 0) const; 1383 1384 /// \returns true if the flat_scratch register should be initialized with the 1385 /// pointer to the wave's scratch memory rather than a size and offset. 1386 bool flatScratchIsPointer() const { 1387 return getGeneration() >= AMDGPUSubtarget::GFX9; 1388 } 1389 1390 /// \returns true if the flat_scratch register is initialized by the HW. 1391 /// In this case it is readonly. 1392 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1393 1394 /// \returns true if the architected SGPRs are enabled. 1395 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; } 1396 1397 /// \returns true if Global Data Share is supported. 1398 bool hasGDS() const { return HasGDS; } 1399 1400 /// \returns true if Global Wave Sync is supported. 1401 bool hasGWS() const { return HasGWS; } 1402 1403 /// \returns true if the machine has merged shaders in which s0-s7 are 1404 /// reserved by the hardware and user SGPRs start at s8 1405 bool hasMergedShaders() const { 1406 return getGeneration() >= GFX9; 1407 } 1408 1409 // \returns true if the target supports the pre-NGG legacy geometry path. 1410 bool hasLegacyGeometry() const { return getGeneration() < GFX11; } 1411 1412 // \returns true if preloading kernel arguments is supported. 1413 bool hasKernargPreload() const { return KernargPreload; } 1414 1415 // \returns true if the target has split barriers feature 1416 bool hasSplitBarriers() const { return getGeneration() >= GFX12; } 1417 1418 // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. 1419 bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; } 1420 1421 // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a 1422 // no-return form. 
1423 bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; } 1424 1425 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit 1426 bool hasDX10ClampMode() const { return getGeneration() < GFX12; } 1427 1428 // \returns true if the target has IEEE kernel descriptor mode bit 1429 bool hasIEEEMode() const { return getGeneration() < GFX12; } 1430 1431 // \returns true if the target has IEEE fminimum/fmaximum instructions 1432 bool hasIEEEMinMax() const { return getGeneration() >= GFX12; } 1433 1434 // \returns true if the target has IEEE fminimum3/fmaximum3 instructions 1435 bool hasIEEEMinMax3() const { return hasIEEEMinMax(); } 1436 1437 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit 1438 bool hasRrWGMode() const { return getGeneration() >= GFX12; } 1439 1440 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative 1441 /// values. 1442 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; } 1443 1444 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead 1445 // of sign-extending. 1446 bool hasGetPCZeroExtension() const { return GFX12Insts; } 1447 1448 /// \returns SGPR allocation granularity supported by the subtarget. 1449 unsigned getSGPRAllocGranule() const { 1450 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1451 } 1452 1453 /// \returns SGPR encoding granularity supported by the subtarget. 1454 unsigned getSGPREncodingGranule() const { 1455 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1456 } 1457 1458 /// \returns Total number of SGPRs supported by the subtarget. 1459 unsigned getTotalNumSGPRs() const { 1460 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1461 } 1462 1463 /// \returns Addressable number of SGPRs supported by the subtarget. 
unsigned getAddressableNumSGPRs() const {
  return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
}

/// \returns Minimum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget. Forwards to
/// AMDGPU::IsaInfo::getMinNumSGPRs().
unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
  return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
}

/// \returns Maximum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget. \p Addressable is
/// passed through to AMDGPU::IsaInfo::getMaxNumSGPRs() unchanged.
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
  return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
}

/// \returns Reserved number of SGPRs. This is the common utility function
/// called by the MachineFunction and Function variants of
/// getReservedNumSGPRs. Declaration only; defined out of line.
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
/// \returns Reserved number of SGPRs for given machine function \p MF.
unsigned getReservedNumSGPRs(const MachineFunction &MF) const;

/// \returns Reserved number of SGPRs for given function \p F.
unsigned getReservedNumSGPRs(const Function &F) const;

/// \returns max num SGPRs. This is the common utility function called by the
/// MachineFunction and Function variants of getMaxNumSGPRs. Declaration only;
/// defined out of line.
unsigned getBaseMaxNumSGPRs(const Function &F,
                            std::pair<unsigned, unsigned> WavesPerEU,
                            unsigned PreloadedSGPRs,
                            unsigned ReservedNumSGPRs) const;

/// \returns Maximum number of SGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of SGPRs explicitly
/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
///
/// \returns Value that meets number of waves per execution unit requirement
/// if explicitly requested value cannot be converted to integer, violates
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

/// \returns Maximum number of SGPRs that meets number of waves per execution
/// unit requirement for function \p F, or number of SGPRs explicitly
/// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
///
/// \returns Value that meets number of waves per execution unit requirement
/// if explicitly requested value cannot be converted to integer, violates
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumSGPRs(const Function &F) const;

/// \returns VGPR allocation granularity supported by the subtarget, as
/// reported by AMDGPU::IsaInfo.
unsigned getVGPRAllocGranule() const {
  return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
}

/// \returns VGPR encoding granularity supported by the subtarget, as reported
/// by AMDGPU::IsaInfo.
unsigned getVGPREncodingGranule() const {
  return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
}

/// \returns Total number of VGPRs supported by the subtarget, as reported by
/// AMDGPU::IsaInfo.
unsigned getTotalNumVGPRs() const {
  return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
}

/// \returns Addressable number of architectural VGPRs supported by the
/// subtarget, as reported by AMDGPU::IsaInfo.
unsigned getAddressableNumArchVGPRs() const {
  return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
}

/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
  return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
}

/// \returns the minimum number of VGPRs that will prevent achieving more than
/// the specified number of waves \p WavesPerEU. Forwards to
/// AMDGPU::IsaInfo::getMinNumVGPRs().
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
  return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
}

/// \returns the maximum number of VGPRs that can be used and still achieve
/// at least the specified number of waves \p WavesPerEU. Forwards to
/// AMDGPU::IsaInfo::getMaxNumVGPRs().
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
  return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
}

/// \returns max num VGPRs. This is the common utility function
/// called by MachineFunction and Function variants of getMaxNumVGPRs.
/// Declaration only; defined out of line.
unsigned getBaseMaxNumVGPRs(const Function &F,
                            std::pair<unsigned, unsigned> WavesPerEU) const;
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p F, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
///
/// \returns Value that meets number of waves per execution unit requirement
/// if explicitly requested value cannot be converted to integer, violates
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumVGPRs(const Function &F) const;

/// \returns the same value as getMaxNumVGPRs(\p F); no separate AGPR limit is
/// computed here.
unsigned getMaxNumAGPRs(const Function &F) const {
  return getMaxNumVGPRs(F);
}

/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
///
/// \returns Value that meets number of waves per execution unit requirement
/// if explicitly requested value cannot be converted to integer, violates
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;

/// \returns true if the wavefront size is 32.
bool isWave32() const {
  return getWavefrontSize() == 32;
}

/// \returns true if the wavefront size is 64.
bool isWave64() const {
  return getWavefrontSize() == 64;
}

/// Returns if the wavesize of this subtarget is known reliable. This is false
/// only for a default target-cpu that does not have an explicit
/// +wavefrontsize target feature.
bool isWaveSizeKnown() const {
  return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
         hasFeature(AMDGPU::FeatureWavefrontSize64);
}

/// \returns the register class returned by the register info's getBoolRC().
const TargetRegisterClass *getBoolRC() const {
  return getRegisterInfo()->getBoolRC();
}

/// \returns Maximum number of work groups per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
  return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
}

/// \returns Minimum flat work group size supported by the subtarget.
unsigned getMinFlatWorkGroupSize() const override {
  return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
}

/// \returns Maximum flat work group size supported by the subtarget.
unsigned getMaxFlatWorkGroupSize() const override {
  return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
}

/// \returns Number of waves per execution unit required to support the given
/// \p FlatWorkGroupSize.
1622 unsigned 1623 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1624 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1625 } 1626 1627 /// \returns Minimum number of waves per execution unit supported by the 1628 /// subtarget. 1629 unsigned getMinWavesPerEU() const override { 1630 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1631 } 1632 1633 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1634 SDep &Dep, 1635 const TargetSchedModel *SchedModel) const override; 1636 1637 // \returns true if it's beneficial on this subtarget for the scheduler to 1638 // cluster stores as well as loads. 1639 bool shouldClusterStores() const { return getGeneration() >= GFX11; } 1640 1641 // \returns the number of address arguments from which to enable MIMG NSA 1642 // on supported architectures. 1643 unsigned getNSAThreshold(const MachineFunction &MF) const; 1644 1645 // \returns true if the subtarget has a hazard requiring an "s_nop 0" 1646 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". 1647 bool requiresNopBeforeDeallocVGPRs() const { 1648 // Currently all targets that support the dealloc VGPRs message also require 1649 // the nop. 1650 return true; 1651 } 1652 1653 bool requiresDisjointEarlyClobberAndUndef() const override { 1654 // AMDGPU doesn't care if early-clobber and undef operands are allocated 1655 // to the same register. 
1656 return false; 1657 } 1658 }; 1659 1660 class GCNUserSGPRUsageInfo { 1661 public: 1662 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; } 1663 1664 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } 1665 1666 bool hasDispatchPtr() const { return DispatchPtr; } 1667 1668 bool hasQueuePtr() const { return QueuePtr; } 1669 1670 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; } 1671 1672 bool hasDispatchID() const { return DispatchID; } 1673 1674 bool hasFlatScratchInit() const { return FlatScratchInit; } 1675 1676 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; } 1677 1678 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; } 1679 1680 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; } 1681 1682 unsigned getNumFreeUserSGPRs(); 1683 1684 void allocKernargPreloadSGPRs(unsigned NumSGPRs); 1685 1686 enum UserSGPRID : unsigned { 1687 ImplicitBufferPtrID = 0, 1688 PrivateSegmentBufferID = 1, 1689 DispatchPtrID = 2, 1690 QueuePtrID = 3, 1691 KernargSegmentPtrID = 4, 1692 DispatchIdID = 5, 1693 FlatScratchInitID = 6, 1694 PrivateSegmentSizeID = 7 1695 }; 1696 1697 // Returns the size in number of SGPRs for preload user SGPR field. 
1698 static unsigned getNumUserSGPRForField(UserSGPRID ID) { 1699 switch (ID) { 1700 case ImplicitBufferPtrID: 1701 return 2; 1702 case PrivateSegmentBufferID: 1703 return 4; 1704 case DispatchPtrID: 1705 return 2; 1706 case QueuePtrID: 1707 return 2; 1708 case KernargSegmentPtrID: 1709 return 2; 1710 case DispatchIdID: 1711 return 2; 1712 case FlatScratchInitID: 1713 return 2; 1714 case PrivateSegmentSizeID: 1715 return 1; 1716 } 1717 llvm_unreachable("Unknown UserSGPRID."); 1718 } 1719 1720 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST); 1721 1722 private: 1723 const GCNSubtarget &ST; 1724 1725 // Private memory buffer 1726 // Compute directly in sgpr[0:1] 1727 // Other shaders indirect 64-bits at sgpr[0:1] 1728 bool ImplicitBufferPtr = false; 1729 1730 bool PrivateSegmentBuffer = false; 1731 1732 bool DispatchPtr = false; 1733 1734 bool QueuePtr = false; 1735 1736 bool KernargSegmentPtr = false; 1737 1738 bool DispatchID = false; 1739 1740 bool FlatScratchInit = false; 1741 1742 bool PrivateSegmentSize = false; 1743 1744 unsigned NumKernargPreloadSGPRs = 0; 1745 1746 unsigned NumUsedUserSGPRs = 0; 1747 }; 1748 1749 } // end namespace llvm 1750 1751 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1752