1 //==- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface --*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H 14 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H 15 16 #include "AMDGPUArgumentUsageInfo.h" 17 #include "AMDGPUMachineFunction.h" 18 #include "SIInstrInfo.h" 19 #include "SIRegisterInfo.h" 20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 21 #include "llvm/ADT/ArrayRef.h" 22 #include "llvm/ADT/DenseMap.h" 23 #include "llvm/ADT/Optional.h" 24 #include "llvm/ADT/SmallVector.h" 25 #include "llvm/CodeGen/PseudoSourceValue.h" 26 #include "llvm/CodeGen/TargetInstrInfo.h" 27 #include "llvm/MC/MCRegisterInfo.h" 28 #include "llvm/Support/ErrorHandling.h" 29 #include <array> 30 #include <cassert> 31 #include <utility> 32 #include <vector> 33 34 namespace llvm { 35 36 class MachineFrameInfo; 37 class MachineFunction; 38 class TargetRegisterClass; 39 40 class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { 41 public: 42 // TODO: Is the img rsrc useful? 43 explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) : 44 PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {} 45 46 bool isConstant(const MachineFrameInfo *) const override { 47 // This should probably be true for most images, but we will start by being 48 // conservative. 49 return false; 50 } 51 52 bool isAliased(const MachineFrameInfo *) const override { 53 return true; 54 } 55 56 bool mayAlias(const MachineFrameInfo *) const override { 57 return true; 58 } 59 }; 60 61 class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue { 62 public: 63 explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) : 64 PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { } 65 66 bool isConstant(const MachineFrameInfo *) const override { 67 // This should probably be true for most images, but we will start by being 68 // conservative. 69 return false; 70 } 71 72 bool isAliased(const MachineFrameInfo *) const override { 73 return true; 74 } 75 76 bool mayAlias(const MachineFrameInfo *) const override { 77 return true; 78 } 79 }; 80 81 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which 82 /// tells the hardware which interpolation parameters to load. 83 class SIMachineFunctionInfo final : public AMDGPUMachineFunction { 84 unsigned TIDReg = AMDGPU::NoRegister; 85 86 // Registers that may be reserved for spilling purposes. These may be the same 87 // as the input registers. 88 unsigned ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG; 89 unsigned ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG; 90 91 // This is the current function's incremented size from the kernel's scratch 92 // wave offset register. For an entry function, this is exactly the same as 93 // the ScratchWaveOffsetReg. 94 unsigned FrameOffsetReg = AMDGPU::FP_REG; 95 96 // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. 97 unsigned StackPtrOffsetReg = AMDGPU::SP_REG; 98 99 AMDGPUFunctionArgInfo ArgInfo; 100 101 // Graphics info. 102 unsigned PSInputAddr = 0; 103 unsigned PSInputEnable = 0; 104 105 /// Number of bytes of arguments this function has on the stack. If the callee 106 /// is expected to restore the argument stack this should be a multiple of 16, 107 /// all usable during a tail call. 108 /// 109 /// The alternative would forbid tail call optimisation in some cases: if we 110 /// want to transfer control from a function with 8-bytes of stack-argument 111 /// space to a function with 16-bytes then misalignment of this value would 112 /// make a stack adjustment necessary, which could not be undone by the 113 /// callee. 114 unsigned BytesInStackArgArea = 0; 115 116 bool ReturnsVoid = true; 117 118 // A pair of default/requested minimum/maximum flat work group sizes. 119 // Minimum - first, maximum - second. 120 std::pair<unsigned, unsigned> FlatWorkGroupSizes = {0, 0}; 121 122 // A pair of default/requested minimum/maximum number of waves per execution 123 // unit. Minimum - first, maximum - second. 124 std::pair<unsigned, unsigned> WavesPerEU = {0, 0}; 125 126 // Stack object indices for work group IDs. 127 std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices = {{0, 0, 0}}; 128 129 // Stack object indices for work item IDs. 130 std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}}; 131 132 DenseMap<const Value *, 133 std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs; 134 DenseMap<const Value *, 135 std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs; 136 137 private: 138 unsigned LDSWaveSpillSize = 0; 139 unsigned NumUserSGPRs = 0; 140 unsigned NumSystemSGPRs = 0; 141 142 bool HasSpilledSGPRs = false; 143 bool HasSpilledVGPRs = false; 144 bool HasNonSpillStackObjects = false; 145 bool IsStackRealigned = false; 146 147 unsigned NumSpilledSGPRs = 0; 148 unsigned NumSpilledVGPRs = 0; 149 150 // Feature bits required for inputs passed in user SGPRs. 151 bool PrivateSegmentBuffer : 1; 152 bool DispatchPtr : 1; 153 bool QueuePtr : 1; 154 bool KernargSegmentPtr : 1; 155 bool DispatchID : 1; 156 bool FlatScratchInit : 1; 157 158 // Feature bits required for inputs passed in system SGPRs. 159 bool WorkGroupIDX : 1; // Always initialized. 160 bool WorkGroupIDY : 1; 161 bool WorkGroupIDZ : 1; 162 bool WorkGroupInfo : 1; 163 bool PrivateSegmentWaveByteOffset : 1; 164 165 bool WorkItemIDX : 1; // Always initialized. 166 bool WorkItemIDY : 1; 167 bool WorkItemIDZ : 1; 168 169 // Private memory buffer 170 // Compute directly in sgpr[0:1] 171 // Other shaders indirect 64-bits at sgpr[0:1] 172 bool ImplicitBufferPtr : 1; 173 174 // Pointer to where the ABI inserts special kernel arguments separate from the 175 // user arguments. This is an offset from the KernargSegmentPtr. 176 bool ImplicitArgPtr : 1; 177 178 // The hard-wired high half of the address of the global information table 179 // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since 180 // current hardware only allows a 16 bit value. 181 unsigned GITPtrHigh; 182 183 unsigned HighBitsOf32BitAddress; 184 185 // Current recorded maximum possible occupancy. 186 unsigned Occupancy; 187 188 MCPhysReg getNextUserSGPR() const; 189 190 MCPhysReg getNextSystemSGPR() const; 191 192 public: 193 struct SpilledReg { 194 unsigned VGPR = 0; 195 int Lane = -1; 196 197 SpilledReg() = default; 198 SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {} 199 200 bool hasLane() { return Lane != -1;} 201 bool hasReg() { return VGPR != 0;} 202 }; 203 204 struct SGPRSpillVGPRCSR { 205 // VGPR used for SGPR spills 206 unsigned VGPR; 207 208 // If the VGPR is a CSR, the stack slot used to save/restore it in the 209 // prolog/epilog. 210 Optional<int> FI; 211 212 SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {} 213 }; 214 215 private: 216 // SGPR->VGPR spilling support. 217 using SpillRegMask = std::pair<unsigned, unsigned>; 218 219 // Track VGPR + wave index for each subregister of the SGPR spilled to 220 // frameindex key. 221 DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills; 222 unsigned NumVGPRSpillLanes = 0; 223 SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs; 224 225 public: 226 SIMachineFunctionInfo(const MachineFunction &MF); 227 228 ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const { 229 auto I = SGPRToVGPRSpills.find(FrameIndex); 230 return (I == SGPRToVGPRSpills.end()) ? 231 ArrayRef<SpilledReg>() : makeArrayRef(I->second); 232 } 233 234 ArrayRef<SGPRSpillVGPRCSR> getSGPRSpillVGPRs() const { 235 return SpillVGPRs; 236 } 237 238 bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); 239 void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); 240 241 bool hasCalculatedTID() const { return TIDReg != 0; }; 242 unsigned getTIDReg() const { return TIDReg; }; 243 void setTIDReg(unsigned Reg) { TIDReg = Reg; } 244 245 unsigned getBytesInStackArgArea() const { 246 return BytesInStackArgArea; 247 } 248 249 void setBytesInStackArgArea(unsigned Bytes) { 250 BytesInStackArgArea = Bytes; 251 } 252 253 // Add user SGPRs. 254 unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); 255 unsigned addDispatchPtr(const SIRegisterInfo &TRI); 256 unsigned addQueuePtr(const SIRegisterInfo &TRI); 257 unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); 258 unsigned addDispatchID(const SIRegisterInfo &TRI); 259 unsigned addFlatScratchInit(const SIRegisterInfo &TRI); 260 unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI); 261 262 // Add system SGPRs. 263 unsigned addWorkGroupIDX() { 264 ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); 265 NumSystemSGPRs += 1; 266 return ArgInfo.WorkGroupIDX.getRegister(); 267 } 268 269 unsigned addWorkGroupIDY() { 270 ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); 271 NumSystemSGPRs += 1; 272 return ArgInfo.WorkGroupIDY.getRegister(); 273 } 274 275 unsigned addWorkGroupIDZ() { 276 ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); 277 NumSystemSGPRs += 1; 278 return ArgInfo.WorkGroupIDZ.getRegister(); 279 } 280 281 unsigned addWorkGroupInfo() { 282 ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR()); 283 NumSystemSGPRs += 1; 284 return ArgInfo.WorkGroupInfo.getRegister(); 285 } 286 287 // Add special VGPR inputs 288 void setWorkItemIDX(ArgDescriptor Arg) { 289 ArgInfo.WorkItemIDX = Arg; 290 } 291 292 void setWorkItemIDY(ArgDescriptor Arg) { 293 ArgInfo.WorkItemIDY = Arg; 294 } 295 296 void setWorkItemIDZ(ArgDescriptor Arg) { 297 ArgInfo.WorkItemIDZ = Arg; 298 } 299 300 unsigned addPrivateSegmentWaveByteOffset() { 301 ArgInfo.PrivateSegmentWaveByteOffset 302 = ArgDescriptor::createRegister(getNextSystemSGPR()); 303 NumSystemSGPRs += 1; 304 return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); 305 } 306 307 void setPrivateSegmentWaveByteOffset(unsigned Reg) { 308 ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg); 309 } 310 311 bool hasPrivateSegmentBuffer() const { 312 return PrivateSegmentBuffer; 313 } 314 315 bool hasDispatchPtr() const { 316 return DispatchPtr; 317 } 318 319 bool hasQueuePtr() const { 320 return QueuePtr; 321 } 322 323 bool hasKernargSegmentPtr() const { 324 return KernargSegmentPtr; 325 } 326 327 bool hasDispatchID() const { 328 return DispatchID; 329 } 330 331 bool hasFlatScratchInit() const { 332 return FlatScratchInit; 333 } 334 335 bool hasWorkGroupIDX() const { 336 return WorkGroupIDX; 337 } 338 339 bool hasWorkGroupIDY() const { 340 return WorkGroupIDY; 341 } 342 343 bool hasWorkGroupIDZ() const { 344 return WorkGroupIDZ; 345 } 346 347 bool hasWorkGroupInfo() const { 348 return WorkGroupInfo; 349 } 350 351 bool hasPrivateSegmentWaveByteOffset() const { 352 return PrivateSegmentWaveByteOffset; 353 } 354 355 bool hasWorkItemIDX() const { 356 return WorkItemIDX; 357 } 358 359 bool hasWorkItemIDY() const { 360 return WorkItemIDY; 361 } 362 363 bool hasWorkItemIDZ() const { 364 return WorkItemIDZ; 365 } 366 367 bool hasImplicitArgPtr() const { 368 return ImplicitArgPtr; 369 } 370 371 bool hasImplicitBufferPtr() const { 372 return ImplicitBufferPtr; 373 } 374 375 AMDGPUFunctionArgInfo &getArgInfo() { 376 return ArgInfo; 377 } 378 379 const AMDGPUFunctionArgInfo &getArgInfo() const { 380 return ArgInfo; 381 } 382 383 std::pair<const ArgDescriptor *, const TargetRegisterClass *> 384 getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const { 385 return ArgInfo.getPreloadedValue(Value); 386 } 387 388 unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { 389 return ArgInfo.getPreloadedValue(Value).first->getRegister(); 390 } 391 392 unsigned getGITPtrHigh() const { 393 return GITPtrHigh; 394 } 395 396 unsigned get32BitAddressHighBits() const { 397 return HighBitsOf32BitAddress; 398 } 399 400 unsigned getNumUserSGPRs() const { 401 return NumUserSGPRs; 402 } 403 404 unsigned getNumPreloadedSGPRs() const { 405 return NumUserSGPRs + NumSystemSGPRs; 406 } 407 408 unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { 409 return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); 410 } 411 412 /// Returns the physical register reserved for use as the resource 413 /// descriptor for scratch accesses. 414 unsigned getScratchRSrcReg() const { 415 return ScratchRSrcReg; 416 } 417 418 void setScratchRSrcReg(unsigned Reg) { 419 assert(Reg != 0 && "Should never be unset"); 420 ScratchRSrcReg = Reg; 421 } 422 423 unsigned getScratchWaveOffsetReg() const { 424 return ScratchWaveOffsetReg; 425 } 426 427 unsigned getFrameOffsetReg() const { 428 return FrameOffsetReg; 429 } 430 431 void setStackPtrOffsetReg(unsigned Reg) { 432 assert(Reg != 0 && "Should never be unset"); 433 StackPtrOffsetReg = Reg; 434 } 435 436 // Note the unset value for this is AMDGPU::SP_REG rather than 437 // NoRegister. This is mostly a workaround for MIR tests where state that 438 // can't be directly computed from the function is not preserved in serialized 439 // MIR. 440 unsigned getStackPtrOffsetReg() const { 441 return StackPtrOffsetReg; 442 } 443 444 void setScratchWaveOffsetReg(unsigned Reg) { 445 assert(Reg != 0 && "Should never be unset"); 446 ScratchWaveOffsetReg = Reg; 447 if (isEntryFunction()) 448 FrameOffsetReg = ScratchWaveOffsetReg; 449 } 450 451 unsigned getQueuePtrUserSGPR() const { 452 return ArgInfo.QueuePtr.getRegister(); 453 } 454 455 unsigned getImplicitBufferPtrUserSGPR() const { 456 return ArgInfo.ImplicitBufferPtr.getRegister(); 457 } 458 459 bool hasSpilledSGPRs() const { 460 return HasSpilledSGPRs; 461 } 462 463 void setHasSpilledSGPRs(bool Spill = true) { 464 HasSpilledSGPRs = Spill; 465 } 466 467 bool hasSpilledVGPRs() const { 468 return HasSpilledVGPRs; 469 } 470 471 void setHasSpilledVGPRs(bool Spill = true) { 472 HasSpilledVGPRs = Spill; 473 } 474 475 bool hasNonSpillStackObjects() const { 476 return HasNonSpillStackObjects; 477 } 478 479 void setHasNonSpillStackObjects(bool StackObject = true) { 480 HasNonSpillStackObjects = StackObject; 481 } 482 483 bool isStackRealigned() const { 484 return IsStackRealigned; 485 } 486 487 void setIsStackRealigned(bool Realigned = true) { 488 IsStackRealigned = Realigned; 489 } 490 491 unsigned getNumSpilledSGPRs() const { 492 return NumSpilledSGPRs; 493 } 494 495 unsigned getNumSpilledVGPRs() const { 496 return NumSpilledVGPRs; 497 } 498 499 void addToSpilledSGPRs(unsigned num) { 500 NumSpilledSGPRs += num; 501 } 502 503 void addToSpilledVGPRs(unsigned num) { 504 NumSpilledVGPRs += num; 505 } 506 507 unsigned getPSInputAddr() const { 508 return PSInputAddr; 509 } 510 511 unsigned getPSInputEnable() const { 512 return PSInputEnable; 513 } 514 515 bool isPSInputAllocated(unsigned Index) const { 516 return PSInputAddr & (1 << Index); 517 } 518 519 void markPSInputAllocated(unsigned Index) { 520 PSInputAddr |= 1 << Index; 521 } 522 523 void markPSInputEnabled(unsigned Index) { 524 PSInputEnable |= 1 << Index; 525 } 526 527 bool returnsVoid() const { 528 return ReturnsVoid; 529 } 530 531 void setIfReturnsVoid(bool Value) { 532 ReturnsVoid = Value; 533 } 534 535 /// \returns A pair of default/requested minimum/maximum flat work group sizes 536 /// for this function. 537 std::pair<unsigned, unsigned> getFlatWorkGroupSizes() const { 538 return FlatWorkGroupSizes; 539 } 540 541 /// \returns Default/requested minimum flat work group size for this function. 542 unsigned getMinFlatWorkGroupSize() const { 543 return FlatWorkGroupSizes.first; 544 } 545 546 /// \returns Default/requested maximum flat work group size for this function. 547 unsigned getMaxFlatWorkGroupSize() const { 548 return FlatWorkGroupSizes.second; 549 } 550 551 /// \returns A pair of default/requested minimum/maximum number of waves per 552 /// execution unit. 553 std::pair<unsigned, unsigned> getWavesPerEU() const { 554 return WavesPerEU; 555 } 556 557 /// \returns Default/requested minimum number of waves per execution unit. 558 unsigned getMinWavesPerEU() const { 559 return WavesPerEU.first; 560 } 561 562 /// \returns Default/requested maximum number of waves per execution unit. 563 unsigned getMaxWavesPerEU() const { 564 return WavesPerEU.second; 565 } 566 567 /// \returns Stack object index for \p Dim's work group ID. 568 int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const { 569 assert(Dim < 3); 570 return DebuggerWorkGroupIDStackObjectIndices[Dim]; 571 } 572 573 /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx. 574 void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) { 575 assert(Dim < 3); 576 DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx; 577 } 578 579 /// \returns Stack object index for \p Dim's work item ID. 580 int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const { 581 assert(Dim < 3); 582 return DebuggerWorkItemIDStackObjectIndices[Dim]; 583 } 584 585 /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx. 586 void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) { 587 assert(Dim < 3); 588 DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx; 589 } 590 591 /// \returns SGPR used for \p Dim's work group ID. 592 unsigned getWorkGroupIDSGPR(unsigned Dim) const { 593 switch (Dim) { 594 case 0: 595 assert(hasWorkGroupIDX()); 596 return ArgInfo.WorkGroupIDX.getRegister(); 597 case 1: 598 assert(hasWorkGroupIDY()); 599 return ArgInfo.WorkGroupIDY.getRegister(); 600 case 2: 601 assert(hasWorkGroupIDZ()); 602 return ArgInfo.WorkGroupIDZ.getRegister(); 603 } 604 llvm_unreachable("unexpected dimension"); 605 } 606 607 /// \returns VGPR used for \p Dim' work item ID. 608 unsigned getWorkItemIDVGPR(unsigned Dim) const; 609 610 unsigned getLDSWaveSpillSize() const { 611 return LDSWaveSpillSize; 612 } 613 614 const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII, 615 const Value *BufferRsrc) { 616 assert(BufferRsrc); 617 auto PSV = BufferPSVs.try_emplace( 618 BufferRsrc, 619 llvm::make_unique<AMDGPUBufferPseudoSourceValue>(TII)); 620 return PSV.first->second.get(); 621 } 622 623 const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII, 624 const Value *ImgRsrc) { 625 assert(ImgRsrc); 626 auto PSV = ImagePSVs.try_emplace( 627 ImgRsrc, 628 llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII)); 629 return PSV.first->second.get(); 630 } 631 632 unsigned getOccupancy() const { 633 return Occupancy; 634 } 635 636 unsigned getMinAllowedOccupancy() const { 637 if (!isMemoryBound() && !needsWaveLimiter()) 638 return Occupancy; 639 return (Occupancy < 4) ? Occupancy : 4; 640 } 641 642 void limitOccupancy(const MachineFunction &MF); 643 644 void limitOccupancy(unsigned Limit) { 645 if (Occupancy > Limit) 646 Occupancy = Limit; 647 } 648 649 void increaseOccupancy(const MachineFunction &MF, unsigned Limit) { 650 if (Occupancy < Limit) 651 Occupancy = Limit; 652 limitOccupancy(MF); 653 } 654 }; 655 656 } // end namespace llvm 657 658 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H 659