1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMDGPU specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 17 #include "AMDGPU.h" 18 #include "AMDGPUCallLowering.h" 19 #include "R600FrameLowering.h" 20 #include "R600ISelLowering.h" 21 #include "R600InstrInfo.h" 22 #include "SIFrameLowering.h" 23 #include "SIISelLowering.h" 24 #include "SIInstrInfo.h" 25 #include "Utils/AMDGPUBaseInfo.h" 26 #include "llvm/ADT/Triple.h" 27 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 28 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" 29 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" 30 #include "llvm/CodeGen/MachineFunction.h" 31 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 32 #include "llvm/MC/MCInstrItineraries.h" 33 #include "llvm/Support/MathExtras.h" 34 #include <cassert> 35 #include <cstdint> 36 #include <memory> 37 #include <utility> 38 39 #define GET_SUBTARGETINFO_HEADER 40 #include "AMDGPUGenSubtargetInfo.inc" 41 #define GET_SUBTARGETINFO_HEADER 42 #include "R600GenSubtargetInfo.inc" 43 44 namespace llvm { 45 46 class StringRef; 47 48 class AMDGPUSubtarget { 49 public: 50 enum Generation { 51 R600 = 0, 52 R700 = 1, 53 EVERGREEN = 2, 54 NORTHERN_ISLANDS = 3, 55 SOUTHERN_ISLANDS = 4, 56 SEA_ISLANDS = 5, 57 VOLCANIC_ISLANDS = 6, 58 GFX9 = 7, 59 GFX10 = 8 60 }; 61 62 private: 63 Triple TargetTriple; 64 65 protected: 66 bool Has16BitInsts; 67 bool HasMadMixInsts; 68 bool FP32Denormals; 69 bool FPExceptions; 70 bool HasSDWA; 71 bool HasVOP3PInsts; 72 bool HasMulI24; 73 bool HasMulU24; 74 bool HasInv2PiInlineImm; 75 bool HasFminFmaxLegacy; 76 bool EnablePromoteAlloca; 77 bool HasTrigReducedRange; 78 unsigned MaxWavesPerEU; 79 int LocalMemorySize; 80 unsigned WavefrontSize; 81 82 public: 83 AMDGPUSubtarget(const Triple &TT); 84 85 static const AMDGPUSubtarget &get(const MachineFunction &MF); 86 static const AMDGPUSubtarget &get(const TargetMachine &TM, 87 const Function &F); 88 89 /// \returns Default range flat work group size for a calling convention. 90 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; 91 92 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 93 /// for function \p F, or minimum/maximum flat work group sizes explicitly 94 /// requested using "amdgpu-flat-work-group-size" attribute attached to 95 /// function \p F. 96 /// 97 /// \returns Subtarget's default values if explicitly requested values cannot 98 /// be converted to integer, or violate subtarget's specifications. 99 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 100 101 /// \returns Subtarget's default pair of minimum/maximum number of waves per 102 /// execution unit for function \p F, or minimum/maximum number of waves per 103 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 104 /// attached to function \p F. 105 /// 106 /// \returns Subtarget's default values if explicitly requested values cannot 107 /// be converted to integer, violate subtarget's specifications, or are not 108 /// compatible with minimum/maximum number of waves limited by flat work group 109 /// size, register usage, and/or lds usage. 110 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; 111 112 /// Return the amount of LDS that can be used that will not restrict the 113 /// occupancy lower than WaveCount. 114 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 115 const Function &) const; 116 117 /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if 118 /// the given LDS memory size is the only constraint. 119 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; 120 121 unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; 122 123 bool isAmdHsaOS() const { 124 return TargetTriple.getOS() == Triple::AMDHSA; 125 } 126 127 bool isAmdPalOS() const { 128 return TargetTriple.getOS() == Triple::AMDPAL; 129 } 130 131 bool isMesa3DOS() const { 132 return TargetTriple.getOS() == Triple::Mesa3D; 133 } 134 135 bool isMesaKernel(const Function &F) const { 136 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); 137 } 138 139 bool isAmdHsaOrMesa(const Function &F) const { 140 return isAmdHsaOS() || isMesaKernel(F); 141 } 142 143 bool has16BitInsts() const { 144 return Has16BitInsts; 145 } 146 147 bool hasMadMixInsts() const { 148 return HasMadMixInsts; 149 } 150 151 bool hasFP32Denormals() const { 152 return FP32Denormals; 153 } 154 155 bool hasFPExceptions() const { 156 return FPExceptions; 157 } 158 159 bool hasSDWA() const { 160 return HasSDWA; 161 } 162 163 bool hasVOP3PInsts() const { 164 return HasVOP3PInsts; 165 } 166 167 bool hasMulI24() const { 168 return HasMulI24; 169 } 170 171 bool hasMulU24() const { 172 return HasMulU24; 173 } 174 175 bool hasInv2PiInlineImm() const { 176 return HasInv2PiInlineImm; 177 } 178 179 bool hasFminFmaxLegacy() const { 180 return HasFminFmaxLegacy; 181 } 182 183 bool hasTrigReducedRange() const { 184 return HasTrigReducedRange; 185 } 186 187 bool isPromoteAllocaEnabled() const { 188 return EnablePromoteAlloca; 189 } 190 191 unsigned getWavefrontSize() const { 192 return WavefrontSize; 193 } 194 195 int getLocalMemorySize() const { 196 return LocalMemorySize; 197 } 198 199 Align getAlignmentForImplicitArgPtr() const { 200 return isAmdHsaOS() ? Align(8) : Align(4); 201 } 202 203 /// Returns the offset in bytes from the start of the input buffer 204 /// of the first explicit kernel argument. 205 unsigned getExplicitKernelArgOffset(const Function &F) const { 206 return isAmdHsaOrMesa(F) ? 0 : 36; 207 } 208 209 /// \returns Maximum number of work groups per compute unit supported by the 210 /// subtarget and limited by given \p FlatWorkGroupSize. 211 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; 212 213 /// \returns Minimum flat work group size supported by the subtarget. 214 virtual unsigned getMinFlatWorkGroupSize() const = 0; 215 216 /// \returns Maximum flat work group size supported by the subtarget. 217 virtual unsigned getMaxFlatWorkGroupSize() const = 0; 218 219 /// \returns Maximum number of waves per execution unit supported by the 220 /// subtarget and limited by given \p FlatWorkGroupSize. 221 virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const = 0; 222 223 /// \returns Minimum number of waves per execution unit supported by the 224 /// subtarget. 225 virtual unsigned getMinWavesPerEU() const = 0; 226 227 /// \returns Maximum number of waves per execution unit supported by the 228 /// subtarget without any kind of limitation. 229 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } 230 231 /// Creates value range metadata on an workitemid.* inrinsic call or load. 232 bool makeLIDRangeMetadata(Instruction *I) const; 233 234 /// \returns Number of bytes of arguments that are passed to a shader or 235 /// kernel in addition to the explicit ones declared for the function. 236 unsigned getImplicitArgNumBytes(const Function &F) const { 237 if (isMesaKernel(F)) 238 return 16; 239 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); 240 } 241 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; 242 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; 243 244 virtual ~AMDGPUSubtarget() {} 245 }; 246 247 class GCNSubtarget : public AMDGPUGenSubtargetInfo, 248 public AMDGPUSubtarget { 249 250 using AMDGPUSubtarget::getMaxWavesPerEU; 251 252 public: 253 enum TrapHandlerAbi { 254 TrapHandlerAbiNone = 0, 255 TrapHandlerAbiHsa = 1 256 }; 257 258 enum TrapID { 259 TrapIDHardwareReserved = 0, 260 TrapIDHSADebugTrap = 1, 261 TrapIDLLVMTrap = 2, 262 TrapIDLLVMDebugTrap = 3, 263 TrapIDDebugBreakpoint = 7, 264 TrapIDDebugReserved8 = 8, 265 TrapIDDebugReservedFE = 0xfe, 266 TrapIDDebugReservedFF = 0xff 267 }; 268 269 enum TrapRegValues { 270 LLVMTrapHandlerRegValue = 1 271 }; 272 273 private: 274 /// GlobalISel related APIs. 275 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 276 std::unique_ptr<InstructionSelector> InstSelector; 277 std::unique_ptr<LegalizerInfo> Legalizer; 278 std::unique_ptr<RegisterBankInfo> RegBankInfo; 279 280 protected: 281 // Basic subtarget description. 282 Triple TargetTriple; 283 unsigned Gen; 284 InstrItineraryData InstrItins; 285 int LDSBankCount; 286 unsigned MaxPrivateElementSize; 287 288 // Possibly statically set by tablegen, but may want to be overridden. 289 bool FastFMAF32; 290 bool HalfRate64Ops; 291 292 // Dynamially set bits that enable features. 293 bool FP64FP16Denormals; 294 bool FlatForGlobal; 295 bool AutoWaitcntBeforeBarrier; 296 bool CodeObjectV3; 297 bool UnalignedScratchAccess; 298 bool UnalignedBufferAccess; 299 bool HasApertureRegs; 300 bool EnableXNACK; 301 bool DoesNotSupportXNACK; 302 bool EnableCuMode; 303 bool TrapHandler; 304 305 // Used as options. 306 bool EnableLoadStoreOpt; 307 bool EnableUnsafeDSOffsetFolding; 308 bool EnableSIScheduler; 309 bool EnableDS128; 310 bool EnablePRTStrictNull; 311 bool DumpCode; 312 313 // Subtarget statically properties set by tablegen 314 bool FP64; 315 bool FMA; 316 bool MIMG_R128; 317 bool IsGCN; 318 bool GCN3Encoding; 319 bool CIInsts; 320 bool GFX8Insts; 321 bool GFX9Insts; 322 bool GFX10Insts; 323 bool GFX7GFX8GFX9Insts; 324 bool SGPRInitBug; 325 bool HasSMemRealTime; 326 bool HasIntClamp; 327 bool HasFmaMixInsts; 328 bool HasMovrel; 329 bool HasVGPRIndexMode; 330 bool HasScalarStores; 331 bool HasScalarAtomics; 332 bool HasSDWAOmod; 333 bool HasSDWAScalar; 334 bool HasSDWASdst; 335 bool HasSDWAMac; 336 bool HasSDWAOutModsVOPC; 337 bool HasDPP; 338 bool HasDPP8; 339 bool HasR128A16; 340 bool HasNSAEncoding; 341 bool HasDLInsts; 342 bool HasDot1Insts; 343 bool HasDot2Insts; 344 bool HasDot3Insts; 345 bool HasDot4Insts; 346 bool HasDot5Insts; 347 bool HasDot6Insts; 348 bool HasMAIInsts; 349 bool HasPkFmacF16Inst; 350 bool HasAtomicFaddInsts; 351 bool EnableSRAMECC; 352 bool DoesNotSupportSRAMECC; 353 bool HasNoSdstCMPX; 354 bool HasVscnt; 355 bool HasRegisterBanking; 356 bool HasVOP3Literal; 357 bool HasNoDataDepHazard; 358 bool FlatAddressSpace; 359 bool FlatInstOffsets; 360 bool FlatGlobalInsts; 361 bool FlatScratchInsts; 362 bool ScalarFlatScratchInsts; 363 bool AddNoCarryInsts; 364 bool HasUnpackedD16VMem; 365 bool R600ALUInst; 366 bool CaymanISA; 367 bool CFALUBug; 368 bool LDSMisalignedBug; 369 bool HasMFMAInlineLiteralBug; 370 bool HasVertexCache; 371 short TexVTXClauseSize; 372 bool ScalarizeGlobal; 373 374 bool HasVcmpxPermlaneHazard; 375 bool HasVMEMtoScalarWriteHazard; 376 bool HasSMEMtoVectorWriteHazard; 377 bool HasInstFwdPrefetchBug; 378 bool HasVcmpxExecWARHazard; 379 bool HasLdsBranchVmemWARHazard; 380 bool HasNSAtoVMEMBug; 381 bool HasOffset3fBug; 382 bool HasFlatSegmentOffsetBug; 383 384 // Dummy feature to use for assembler in tablegen. 385 bool FeatureDisable; 386 387 SelectionDAGTargetInfo TSInfo; 388 private: 389 SIInstrInfo InstrInfo; 390 SITargetLowering TLInfo; 391 SIFrameLowering FrameLowering; 392 393 // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. 394 static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); 395 396 public: 397 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 398 const GCNTargetMachine &TM); 399 ~GCNSubtarget() override; 400 401 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 402 StringRef GPU, StringRef FS); 403 404 const SIInstrInfo *getInstrInfo() const override { 405 return &InstrInfo; 406 } 407 408 const SIFrameLowering *getFrameLowering() const override { 409 return &FrameLowering; 410 } 411 412 const SITargetLowering *getTargetLowering() const override { 413 return &TLInfo; 414 } 415 416 const SIRegisterInfo *getRegisterInfo() const override { 417 return &InstrInfo.getRegisterInfo(); 418 } 419 420 const CallLowering *getCallLowering() const override { 421 return CallLoweringInfo.get(); 422 } 423 424 InstructionSelector *getInstructionSelector() const override { 425 return InstSelector.get(); 426 } 427 428 const LegalizerInfo *getLegalizerInfo() const override { 429 return Legalizer.get(); 430 } 431 432 const RegisterBankInfo *getRegBankInfo() const override { 433 return RegBankInfo.get(); 434 } 435 436 // Nothing implemented, just prevent crashes on use. 437 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 438 return &TSInfo; 439 } 440 441 const InstrItineraryData *getInstrItineraryData() const override { 442 return &InstrItins; 443 } 444 445 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 446 447 Generation getGeneration() const { 448 return (Generation)Gen; 449 } 450 451 unsigned getWavefrontSizeLog2() const { 452 return Log2_32(WavefrontSize); 453 } 454 455 /// Return the number of high bits known to be zero fror a frame index. 456 unsigned getKnownHighZeroBitsForFrameIndex() const { 457 return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); 458 } 459 460 int getLDSBankCount() const { 461 return LDSBankCount; 462 } 463 464 unsigned getMaxPrivateElementSize() const { 465 return MaxPrivateElementSize; 466 } 467 468 unsigned getConstantBusLimit(unsigned Opcode) const; 469 470 bool hasIntClamp() const { 471 return HasIntClamp; 472 } 473 474 bool hasFP64() const { 475 return FP64; 476 } 477 478 bool hasMIMG_R128() const { 479 return MIMG_R128; 480 } 481 482 bool hasHWFP64() const { 483 return FP64; 484 } 485 486 bool hasFastFMAF32() const { 487 return FastFMAF32; 488 } 489 490 bool hasHalfRate64Ops() const { 491 return HalfRate64Ops; 492 } 493 494 bool hasAddr64() const { 495 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 496 } 497 498 // Return true if the target only has the reverse operand versions of VALU 499 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 500 bool hasOnlyRevVALUShifts() const { 501 return getGeneration() >= VOLCANIC_ISLANDS; 502 } 503 504 bool hasBFE() const { 505 return true; 506 } 507 508 bool hasBFI() const { 509 return true; 510 } 511 512 bool hasBFM() const { 513 return hasBFE(); 514 } 515 516 bool hasBCNT(unsigned Size) const { 517 return true; 518 } 519 520 bool hasFFBL() const { 521 return true; 522 } 523 524 bool hasFFBH() const { 525 return true; 526 } 527 528 bool hasMed3_16() const { 529 return getGeneration() >= AMDGPUSubtarget::GFX9; 530 } 531 532 bool hasMin3Max3_16() const { 533 return getGeneration() >= AMDGPUSubtarget::GFX9; 534 } 535 536 bool hasFmaMixInsts() const { 537 return HasFmaMixInsts; 538 } 539 540 bool hasCARRY() const { 541 return true; 542 } 543 544 bool hasFMA() const { 545 return FMA; 546 } 547 548 bool hasSwap() const { 549 return GFX9Insts; 550 } 551 552 bool hasScalarPackInsts() const { 553 return GFX9Insts; 554 } 555 556 bool hasScalarMulHiInsts() const { 557 return GFX9Insts; 558 } 559 560 TrapHandlerAbi getTrapHandlerAbi() const { 561 return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; 562 } 563 564 /// True if the offset field of DS instructions works as expected. On SI, the 565 /// offset uses a 16-bit adder and does not always wrap properly. 566 bool hasUsableDSOffset() const { 567 return getGeneration() >= SEA_ISLANDS; 568 } 569 570 bool unsafeDSOffsetFoldingEnabled() const { 571 return EnableUnsafeDSOffsetFolding; 572 } 573 574 /// Condition output from div_scale is usable. 575 bool hasUsableDivScaleConditionOutput() const { 576 return getGeneration() != SOUTHERN_ISLANDS; 577 } 578 579 /// Extra wait hazard is needed in some cases before 580 /// s_cbranch_vccnz/s_cbranch_vccz. 581 bool hasReadVCCZBug() const { 582 return getGeneration() <= SEA_ISLANDS; 583 } 584 585 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 586 /// was written by a VALU instruction. 587 bool hasSMRDReadVALUDefHazard() const { 588 return getGeneration() == SOUTHERN_ISLANDS; 589 } 590 591 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 592 /// SGPR was written by a VALU Instruction. 593 bool hasVMEMReadSGPRVALUDefHazard() const { 594 return getGeneration() >= VOLCANIC_ISLANDS; 595 } 596 597 bool hasRFEHazards() const { 598 return getGeneration() >= VOLCANIC_ISLANDS; 599 } 600 601 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 602 unsigned getSetRegWaitStates() const { 603 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 604 } 605 606 bool dumpCode() const { 607 return DumpCode; 608 } 609 610 /// Return the amount of LDS that can be used that will not restrict the 611 /// occupancy lower than WaveCount. 612 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 613 const Function &) const; 614 615 bool hasFP16Denormals() const { 616 return FP64FP16Denormals; 617 } 618 619 bool hasFP64Denormals() const { 620 return FP64FP16Denormals; 621 } 622 623 bool supportsMinMaxDenormModes() const { 624 return getGeneration() >= AMDGPUSubtarget::GFX9; 625 } 626 627 /// \returns If target supports S_DENORM_MODE. 628 bool hasDenormModeInst() const { 629 return getGeneration() >= AMDGPUSubtarget::GFX10; 630 } 631 632 bool useFlatForGlobal() const { 633 return FlatForGlobal; 634 } 635 636 /// \returns If target supports ds_read/write_b128 and user enables generation 637 /// of ds_read/write_b128. 638 bool useDS128() const { 639 return CIInsts && EnableDS128; 640 } 641 642 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 643 bool haveRoundOpsF64() const { 644 return CIInsts; 645 } 646 647 /// \returns If MUBUF instructions always perform range checking, even for 648 /// buffer resources used for private memory access. 649 bool privateMemoryResourceIsRangeChecked() const { 650 return getGeneration() < AMDGPUSubtarget::GFX9; 651 } 652 653 /// \returns If target requires PRT Struct NULL support (zero result registers 654 /// for sparse texture support). 655 bool usePRTStrictNull() const { 656 return EnablePRTStrictNull; 657 } 658 659 bool hasAutoWaitcntBeforeBarrier() const { 660 return AutoWaitcntBeforeBarrier; 661 } 662 663 bool hasCodeObjectV3() const { 664 // FIXME: Need to add code object v3 support for mesa and pal. 665 return isAmdHsaOS() ? CodeObjectV3 : false; 666 } 667 668 bool hasUnalignedBufferAccess() const { 669 return UnalignedBufferAccess; 670 } 671 672 bool hasUnalignedScratchAccess() const { 673 return UnalignedScratchAccess; 674 } 675 676 bool hasApertureRegs() const { 677 return HasApertureRegs; 678 } 679 680 bool isTrapHandlerEnabled() const { 681 return TrapHandler; 682 } 683 684 bool isXNACKEnabled() const { 685 return EnableXNACK; 686 } 687 688 bool isCuModeEnabled() const { 689 return EnableCuMode; 690 } 691 692 bool hasFlatAddressSpace() const { 693 return FlatAddressSpace; 694 } 695 696 bool hasFlatScrRegister() const { 697 return hasFlatAddressSpace(); 698 } 699 700 bool hasFlatInstOffsets() const { 701 return FlatInstOffsets; 702 } 703 704 bool hasFlatGlobalInsts() const { 705 return FlatGlobalInsts; 706 } 707 708 bool hasFlatScratchInsts() const { 709 return FlatScratchInsts; 710 } 711 712 bool hasScalarFlatScratchInsts() const { 713 return ScalarFlatScratchInsts; 714 } 715 716 bool hasFlatSegmentOffsetBug() const { 717 return HasFlatSegmentOffsetBug; 718 } 719 720 bool hasFlatLgkmVMemCountInOrder() const { 721 return getGeneration() > GFX9; 722 } 723 724 bool hasD16LoadStore() const { 725 return getGeneration() >= GFX9; 726 } 727 728 bool d16PreservesUnusedBits() const { 729 return hasD16LoadStore() && !isSRAMECCEnabled(); 730 } 731 732 bool hasD16Images() const { 733 return getGeneration() >= VOLCANIC_ISLANDS; 734 } 735 736 /// Return if most LDS instructions have an m0 use that require m0 to be 737 /// iniitalized. 738 bool ldsRequiresM0Init() const { 739 return getGeneration() < GFX9; 740 } 741 742 // True if the hardware rewinds and replays GWS operations if a wave is 743 // preempted. 744 // 745 // If this is false, a GWS operation requires testing if a nack set the 746 // MEM_VIOL bit, and repeating if so. 747 bool hasGWSAutoReplay() const { 748 return getGeneration() >= GFX9; 749 } 750 751 /// \returns if target has ds_gws_sema_release_all instruction. 752 bool hasGWSSemaReleaseAll() const { 753 return CIInsts; 754 } 755 756 bool hasAddNoCarry() const { 757 return AddNoCarryInsts; 758 } 759 760 bool hasUnpackedD16VMem() const { 761 return HasUnpackedD16VMem; 762 } 763 764 // Covers VS/PS/CS graphics shaders 765 bool isMesaGfxShader(const Function &F) const { 766 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 767 } 768 769 bool hasMad64_32() const { 770 return getGeneration() >= SEA_ISLANDS; 771 } 772 773 bool hasSDWAOmod() const { 774 return HasSDWAOmod; 775 } 776 777 bool hasSDWAScalar() const { 778 return HasSDWAScalar; 779 } 780 781 bool hasSDWASdst() const { 782 return HasSDWASdst; 783 } 784 785 bool hasSDWAMac() const { 786 return HasSDWAMac; 787 } 788 789 bool hasSDWAOutModsVOPC() const { 790 return HasSDWAOutModsVOPC; 791 } 792 793 bool hasDLInsts() const { 794 return HasDLInsts; 795 } 796 797 bool hasDot1Insts() const { 798 return HasDot1Insts; 799 } 800 801 bool hasDot2Insts() const { 802 return HasDot2Insts; 803 } 804 805 bool hasDot3Insts() const { 806 return HasDot3Insts; 807 } 808 809 bool hasDot4Insts() const { 810 return HasDot4Insts; 811 } 812 813 bool hasDot5Insts() const { 814 return HasDot5Insts; 815 } 816 817 bool hasDot6Insts() const { 818 return HasDot6Insts; 819 } 820 821 bool hasMAIInsts() const { 822 return HasMAIInsts; 823 } 824 825 bool hasPkFmacF16Inst() const { 826 return HasPkFmacF16Inst; 827 } 828 829 bool hasAtomicFaddInsts() const { 830 return HasAtomicFaddInsts; 831 } 832 833 bool isSRAMECCEnabled() const { 834 return EnableSRAMECC; 835 } 836 837 bool hasNoSdstCMPX() const { 838 return HasNoSdstCMPX; 839 } 840 841 bool hasVscnt() const { 842 return HasVscnt; 843 } 844 845 bool hasRegisterBanking() const { 846 return HasRegisterBanking; 847 } 848 849 bool hasVOP3Literal() const { 850 return HasVOP3Literal; 851 } 852 853 bool hasNoDataDepHazard() const { 854 return HasNoDataDepHazard; 855 } 856 857 bool vmemWriteNeedsExpWaitcnt() const { 858 return getGeneration() < SEA_ISLANDS; 859 } 860 861 // Scratch is allocated in 256 dword per wave blocks for the entire 862 // wavefront. When viewed from the perspecive of an arbitrary workitem, this 863 // is 4-byte aligned. 864 // 865 // Only 4-byte alignment is really needed to access anything. Transformations 866 // on the pointer value itself may rely on the alignment / known low bits of 867 // the pointer. Set this to something above the minimum to avoid needing 868 // dynamic realignment in common cases. 869 Align getStackAlignment() const { return Align(16); } 870 871 bool enableMachineScheduler() const override { 872 return true; 873 } 874 875 bool enableSubRegLiveness() const override { 876 return true; 877 } 878 879 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 880 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 881 882 /// \returns Number of execution units per compute unit supported by the 883 /// subtarget. 884 unsigned getEUsPerCU() const { 885 return AMDGPU::IsaInfo::getEUsPerCU(this); 886 } 887 888 /// \returns Maximum number of waves per compute unit supported by the 889 /// subtarget without any kind of limitation. 890 unsigned getMaxWavesPerCU() const { 891 return AMDGPU::IsaInfo::getMaxWavesPerCU(this); 892 } 893 894 /// \returns Maximum number of waves per compute unit supported by the 895 /// subtarget and limited by given \p FlatWorkGroupSize. 896 unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { 897 return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); 898 } 899 900 /// \returns Number of waves per work group supported by the subtarget and 901 /// limited by given \p FlatWorkGroupSize. 902 unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { 903 return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize); 904 } 905 906 // static wrappers 907 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 908 909 // XXX - Why is this here if it isn't in the default pass set? 910 bool enableEarlyIfConversion() const override { 911 return true; 912 } 913 914 void overrideSchedPolicy(MachineSchedPolicy &Policy, 915 unsigned NumRegionInstrs) const override; 916 917 unsigned getMaxNumUserSGPRs() const { 918 return 16; 919 } 920 921 bool hasSMemRealTime() const { 922 return HasSMemRealTime; 923 } 924 925 bool hasMovrel() const { 926 return HasMovrel; 927 } 928 929 bool hasVGPRIndexMode() const { 930 return HasVGPRIndexMode; 931 } 932 933 bool useVGPRIndexMode(bool UserEnable) const { 934 return !hasMovrel() || (UserEnable && hasVGPRIndexMode()); 935 } 936 937 bool hasScalarCompareEq64() const { 938 return getGeneration() >= VOLCANIC_ISLANDS; 939 } 940 941 bool hasScalarStores() const { 942 return HasScalarStores; 943 } 944 945 bool hasScalarAtomics() const { 946 return HasScalarAtomics; 947 } 948 949 bool hasLDSFPAtomics() const { 950 return GFX8Insts; 951 } 952 953 bool hasDPP() const { 954 return HasDPP; 955 } 956 957 bool hasDPPBroadcasts() const { 958 return HasDPP && getGeneration() < GFX10; 959 } 960 961 bool hasDPPWavefrontShifts() const { 962 return HasDPP && getGeneration() < GFX10; 963 } 964 965 bool hasDPP8() const { 966 return HasDPP8; 967 } 968 969 bool hasR128A16() const { 970 return HasR128A16; 971 } 972 973 bool hasOffset3fBug() const { 974 return HasOffset3fBug; 975 } 976 977 bool hasNSAEncoding() const { 978 return HasNSAEncoding; 979 } 980 981 bool hasMadF16() const; 982 983 bool enableSIScheduler() const { 984 return EnableSIScheduler; 985 } 986 987 bool loadStoreOptEnabled() const { 988 return EnableLoadStoreOpt; 989 } 990 991 bool hasSGPRInitBug() const { 992 return SGPRInitBug; 993 } 994 995 bool hasMFMAInlineLiteralBug() const { 996 return HasMFMAInlineLiteralBug; 997 } 998 999 bool has12DWordStoreHazard() const { 1000 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 1001 } 1002 1003 // \returns true if the subtarget supports DWORDX3 load/store instructions. 1004 bool hasDwordx3LoadStores() const { 1005 return CIInsts; 1006 } 1007 1008 bool hasSMovFedHazard() const { 1009 return getGeneration() == AMDGPUSubtarget::GFX9; 1010 } 1011 1012 bool hasReadM0MovRelInterpHazard() const { 1013 return getGeneration() == AMDGPUSubtarget::GFX9; 1014 } 1015 1016 bool hasReadM0SendMsgHazard() const { 1017 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 1018 getGeneration() <= AMDGPUSubtarget::GFX9; 1019 } 1020 1021 bool hasVcmpxPermlaneHazard() const { 1022 return HasVcmpxPermlaneHazard; 1023 } 1024 1025 bool hasVMEMtoScalarWriteHazard() const { 1026 return HasVMEMtoScalarWriteHazard; 1027 } 1028 1029 bool hasSMEMtoVectorWriteHazard() const { 1030 return HasSMEMtoVectorWriteHazard; 1031 } 1032 1033 bool hasLDSMisalignedBug() const { 1034 return LDSMisalignedBug && !EnableCuMode; 1035 } 1036 1037 bool hasInstFwdPrefetchBug() const { 1038 return HasInstFwdPrefetchBug; 1039 } 1040 1041 bool hasVcmpxExecWARHazard() const { 1042 return HasVcmpxExecWARHazard; 1043 } 1044 1045 bool hasLdsBranchVmemWARHazard() const { 1046 return HasLdsBranchVmemWARHazard; 1047 } 1048 1049 bool hasNSAtoVMEMBug() const { 1050 return HasNSAtoVMEMBug; 1051 } 1052 1053 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1054 /// SGPRs 1055 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1056 1057 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1058 /// VGPRs 1059 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1060 1061 /// Return occupancy for the given function. Used LDS and a number of 1062 /// registers if provided. 1063 /// Note, occupancy can be affected by the scratch allocation as well, but 1064 /// we do not have enough information to compute it. 1065 unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0, 1066 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1067 1068 /// \returns true if the flat_scratch register should be initialized with the 1069 /// pointer to the wave's scratch memory rather than a size and offset. 1070 bool flatScratchIsPointer() const { 1071 return getGeneration() >= AMDGPUSubtarget::GFX9; 1072 } 1073 1074 /// \returns true if the machine has merged shaders in which s0-s7 are 1075 /// reserved by the hardware and user SGPRs start at s8 1076 bool hasMergedShaders() const { 1077 return getGeneration() >= GFX9; 1078 } 1079 1080 /// \returns SGPR allocation granularity supported by the subtarget. 1081 unsigned getSGPRAllocGranule() const { 1082 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1083 } 1084 1085 /// \returns SGPR encoding granularity supported by the subtarget. 1086 unsigned getSGPREncodingGranule() const { 1087 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1088 } 1089 1090 /// \returns Total number of SGPRs supported by the subtarget. 1091 unsigned getTotalNumSGPRs() const { 1092 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1093 } 1094 1095 /// \returns Addressable number of SGPRs supported by the subtarget. 1096 unsigned getAddressableNumSGPRs() const { 1097 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1098 } 1099 1100 /// \returns Minimum number of SGPRs that meets the given number of waves per 1101 /// execution unit requirement supported by the subtarget. 1102 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1103 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1104 } 1105 1106 /// \returns Maximum number of SGPRs that meets the given number of waves per 1107 /// execution unit requirement supported by the subtarget. 1108 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1109 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1110 } 1111 1112 /// \returns Reserved number of SGPRs for given function \p MF. 1113 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1114 1115 /// \returns Maximum number of SGPRs that meets number of waves per execution 1116 /// unit requirement for function \p MF, or number of SGPRs explicitly 1117 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1118 /// 1119 /// \returns Value that meets number of waves per execution unit requirement 1120 /// if explicitly requested value cannot be converted to integer, violates 1121 /// subtarget's specifications, or does not meet number of waves per execution 1122 /// unit requirement. 1123 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1124 1125 /// \returns VGPR allocation granularity supported by the subtarget. 1126 unsigned getVGPRAllocGranule() const { 1127 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1128 } 1129 1130 /// \returns VGPR encoding granularity supported by the subtarget. 1131 unsigned getVGPREncodingGranule() const { 1132 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1133 } 1134 1135 /// \returns Total number of VGPRs supported by the subtarget. 1136 unsigned getTotalNumVGPRs() const { 1137 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1138 } 1139 1140 /// \returns Addressable number of VGPRs supported by the subtarget. 1141 unsigned getAddressableNumVGPRs() const { 1142 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1143 } 1144 1145 /// \returns Minimum number of VGPRs that meets given number of waves per 1146 /// execution unit requirement supported by the subtarget. 1147 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1148 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1149 } 1150 1151 /// \returns Maximum number of VGPRs that meets given number of waves per 1152 /// execution unit requirement supported by the subtarget. 1153 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1154 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1155 } 1156 1157 /// \returns Maximum number of VGPRs that meets number of waves per execution 1158 /// unit requirement for function \p MF, or number of VGPRs explicitly 1159 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1160 /// 1161 /// \returns Value that meets number of waves per execution unit requirement 1162 /// if explicitly requested value cannot be converted to integer, violates 1163 /// subtarget's specifications, or does not meet number of waves per execution 1164 /// unit requirement. 1165 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1166 1167 void getPostRAMutations( 1168 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1169 const override; 1170 1171 bool isWave32() const { 1172 return WavefrontSize == 32; 1173 } 1174 1175 const TargetRegisterClass *getBoolRC() const { 1176 return getRegisterInfo()->getBoolRC(); 1177 } 1178 1179 /// \returns Maximum number of work groups per compute unit supported by the 1180 /// subtarget and limited by given \p FlatWorkGroupSize. 1181 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1182 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1183 } 1184 1185 /// \returns Minimum flat work group size supported by the subtarget. 1186 unsigned getMinFlatWorkGroupSize() const override { 1187 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1188 } 1189 1190 /// \returns Maximum flat work group size supported by the subtarget. 1191 unsigned getMaxFlatWorkGroupSize() const override { 1192 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1193 } 1194 1195 /// \returns Maximum number of waves per execution unit supported by the 1196 /// subtarget and limited by given \p FlatWorkGroupSize. 1197 unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { 1198 return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); 1199 } 1200 1201 /// \returns Minimum number of waves per execution unit supported by the 1202 /// subtarget. 1203 unsigned getMinWavesPerEU() const override { 1204 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1205 } 1206 }; 1207 1208 class R600Subtarget final : public R600GenSubtargetInfo, 1209 public AMDGPUSubtarget { 1210 private: 1211 R600InstrInfo InstrInfo; 1212 R600FrameLowering FrameLowering; 1213 bool FMA; 1214 bool CaymanISA; 1215 bool CFALUBug; 1216 bool HasVertexCache; 1217 bool R600ALUInst; 1218 bool FP64; 1219 short TexVTXClauseSize; 1220 Generation Gen; 1221 R600TargetLowering TLInfo; 1222 InstrItineraryData InstrItins; 1223 SelectionDAGTargetInfo TSInfo; 1224 1225 public: 1226 R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, 1227 const TargetMachine &TM); 1228 1229 const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } 1230 1231 const R600FrameLowering *getFrameLowering() const override { 1232 return &FrameLowering; 1233 } 1234 1235 const R600TargetLowering *getTargetLowering() const override { 1236 return &TLInfo; 1237 } 1238 1239 const R600RegisterInfo *getRegisterInfo() const override { 1240 return &InstrInfo.getRegisterInfo(); 1241 } 1242 1243 const InstrItineraryData *getInstrItineraryData() const override { 1244 return &InstrItins; 1245 } 1246 1247 // Nothing implemented, just prevent crashes on use. 1248 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 1249 return &TSInfo; 1250 } 1251 1252 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 1253 1254 Generation getGeneration() const { 1255 return Gen; 1256 } 1257 1258 Align getStackAlignment() const { return Align(4); } 1259 1260 R600Subtarget &initializeSubtargetDependencies(const Triple &TT, 1261 StringRef GPU, StringRef FS); 1262 1263 bool hasBFE() const { 1264 return (getGeneration() >= EVERGREEN); 1265 } 1266 1267 bool hasBFI() const { 1268 return (getGeneration() >= EVERGREEN); 1269 } 1270 1271 bool hasBCNT(unsigned Size) const { 1272 if (Size == 32) 1273 return (getGeneration() >= EVERGREEN); 1274 1275 return false; 1276 } 1277 1278 bool hasBORROW() const { 1279 return (getGeneration() >= EVERGREEN); 1280 } 1281 1282 bool hasCARRY() const { 1283 return (getGeneration() >= EVERGREEN); 1284 } 1285 1286 bool hasCaymanISA() const { 1287 return CaymanISA; 1288 } 1289 1290 bool hasFFBL() const { 1291 return (getGeneration() >= EVERGREEN); 1292 } 1293 1294 bool hasFFBH() const { 1295 return (getGeneration() >= EVERGREEN); 1296 } 1297 1298 bool hasFMA() const { return FMA; } 1299 1300 bool hasCFAluBug() const { return CFALUBug; } 1301 1302 bool hasVertexCache() const { return HasVertexCache; } 1303 1304 short getTexVTXClauseSize() const { return TexVTXClauseSize; } 1305 1306 bool enableMachineScheduler() const override { 1307 return true; 1308 } 1309 1310 bool enableSubRegLiveness() const override { 1311 return true; 1312 } 1313 1314 /// \returns Maximum number of work groups per compute unit supported by the 1315 /// subtarget and limited by given \p FlatWorkGroupSize. 1316 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1317 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1318 } 1319 1320 /// \returns Minimum flat work group size supported by the subtarget. 1321 unsigned getMinFlatWorkGroupSize() const override { 1322 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1323 } 1324 1325 /// \returns Maximum flat work group size supported by the subtarget. 1326 unsigned getMaxFlatWorkGroupSize() const override { 1327 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1328 } 1329 1330 /// \returns Maximum number of waves per execution unit supported by the 1331 /// subtarget and limited by given \p FlatWorkGroupSize. 1332 unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { 1333 return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); 1334 } 1335 1336 /// \returns Minimum number of waves per execution unit supported by the 1337 /// subtarget. 1338 unsigned getMinWavesPerEU() const override { 1339 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1340 } 1341 }; 1342 1343 } // end namespace llvm 1344 1345 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 1346