//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
// just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save needed, all or inactive lanes of a TmpVGPR
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
//   buffer_store_dword TmpVGPR ; only if active lanes need to be saved
//   s_not exec, exec
//   buffer_store_dword TmpVGPR ; save inactive lanes
//   s_not exec, exec
struct SGPRSpillBuilder {
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // If TmpVGPR is live before the spill or if it is scavenged.
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock *MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }

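  // Lane bookkeeping for the spill: how many SGPR lanes fit into a single
  // VGPR (the wave size), how many VGPRs are needed for all NumSubRegs
  // values, and the lane mask used within one VGPR. For example, a wave64
  // spill of a 4-dword SGPR tuple gives PerVGPR = 64, NumVGPRs = 1 and
  // VGPRLanes = 0xf.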
  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }

  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  //   s_mov_b64 s[6:7], exec   ; Save exec
  //   s_mov_b64 exec, 3        ; Wanted lanemask
  //   buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  //   buffer_store_dword v0    ; Only if no free VGPR was found
  //   s_not_b64 exec, exec
  //   buffer_store_dword v0    ; Save inactive lanes
  //   ; exec stays inverted, it is flipped back in restore.
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
                                            0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes; we only
      // need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    if (TmpVGPRLive) {
      // We need to inform the scavenger that this index is already in use until
      // we're done with the custom emergency spill.
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
    }

    // We may end up recursively calling the scavenger, and don't want to re-use
    // the same register.
    RS->setRegUsed(TmpVGPR);

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }

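  // restore() undoes the work of prepare(): it reloads the saved lanes of
  // TmpVGPR from the emergency slot and, when no SGPR could be scavenged,
  // flips exec back with s_not to undo the inversion left by prepare().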
  // Writes these instructions if an SGPR can be scavenged:
  //   buffer_load_dword v1     ; Restore scavenged VGPR from emergency slot
  //   s_waitcnt vmcnt(0)       ; If a free VGPR was found
  //   s_mov_b64 exec, s[6:7]   ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  //   buffer_load_dword v0     ; Restore inactive lanes
  //   s_waitcnt vmcnt(0)       ; If a free VGPR was found
  //   s_not_b64 exec, exec
  //   buffer_load_dword v0     ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.

      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }

    // Inform the scavenger where we're releasing our custom scavenged register.
    if (TmpVGPRLive) {
      MachineBasicBlock::iterator RestorePt = std::prev(MI);
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
    }
  }

  // Write TmpVGPR to memory or read TmpVGPR from memory.
  // Either using a single buffer_load/store if exec is set to the needed mask
  // or using
  //   buffer_load
  //   s_not exec, exec
  //   buffer_load
  //   s_not exec, exec
  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
    if (SavedExecReg) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
                                  /*IsKill*/ false);
      // Spill inactive lanes
      auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
      auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
    }
  }

  void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
    assert(MBB->getParent() == &MF);
    MI = NewMI;
    MBB = NewMBB;
  }
};

} // namespace llvm

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
                            ST.getAMDGPUDwarfFlavour()),
      ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
  for (auto Reg : AMDGPU::VGPR_16RegClass) {
    if (AMDGPU::isHi(Reg, *this))
      RegPressureIgnoredUnits.set(*regunits(Reg).begin());
  }

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

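  // RegSplitParts[N-1][P] ends up holding the sub-register index that covers
  // N 32-bit registers at the P-th such position within a tuple (e.g., with
  // the generated indices, the second 128-bit piece of a 256-bit register is
  // sub4_sub5_sub6_sub7).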
  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      if (Size & 31)
        continue;
      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      if (Pos % Size)
        continue;
      Pos /= Size;
      if (Vec.empty()) {
        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
        Vec.resize(MaxNumParts);
      }
      Vec[Pos] = Idx;
    }
  };

  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = getSubRegIdxSize(Idx) / 32;
      unsigned Offset = getSubRegIdxOffset(Idx) / 32;
      assert(Width < SubRegFromChannelTableWidthMap.size());
      Width = SubRegFromChannelTableWidthMap[Width];
      if (Width == 0)
        continue;
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;
    }
  };

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
  for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
    Reserved.set(*R);
}

// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SaveList;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SI_Gfx_SaveList;
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CSR_AMDGPU_CS_ChainPreserve_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}

const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
  return nullptr;
}

const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
                               : CSR_AMDGPU_RegMask;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                               : CSR_AMDGPU_SI_Gfx_RegMask;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    // Calls to these functions never return, so we can pretend everything is
    // preserved.
    return AMDGPU_AllVGPRs_RegMask;
  default:
    return nullptr;
  }
}

const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
}

bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
  return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
}

const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
                                          const MachineFunction &MF) const {
  // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
  // equivalent AV class. If one is used, the verifier will crash after
  // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
  // until instruction selection.
  if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
    if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
      return &AMDGPU::AV_32RegClass;
    if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
      return &AMDGPU::AV_64RegClass;
    if (RC == &AMDGPU::VReg_64_Align2RegClass ||
        RC == &AMDGPU::AReg_64_Align2RegClass)
      return &AMDGPU::AV_64_Align2RegClass;
    if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
      return &AMDGPU::AV_96RegClass;
    if (RC == &AMDGPU::VReg_96_Align2RegClass ||
        RC == &AMDGPU::AReg_96_Align2RegClass)
      return &AMDGPU::AV_96_Align2RegClass;
    if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
      return &AMDGPU::AV_128RegClass;
    if (RC == &AMDGPU::VReg_128_Align2RegClass ||
        RC == &AMDGPU::AReg_128_Align2RegClass)
      return &AMDGPU::AV_128_Align2RegClass;
    if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
      return &AMDGPU::AV_160RegClass;
    if (RC == &AMDGPU::VReg_160_Align2RegClass ||
        RC == &AMDGPU::AReg_160_Align2RegClass)
      return &AMDGPU::AV_160_Align2RegClass;
    if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
      return &AMDGPU::AV_192RegClass;
    if (RC == &AMDGPU::VReg_192_Align2RegClass ||
        RC == &AMDGPU::AReg_192_Align2RegClass)
      return &AMDGPU::AV_192_Align2RegClass;
    if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
      return &AMDGPU::AV_256RegClass;
    if (RC == &AMDGPU::VReg_256_Align2RegClass ||
        RC == &AMDGPU::AReg_256_Align2RegClass)
      return &AMDGPU::AV_256_Align2RegClass;
    if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
      return &AMDGPU::AV_512RegClass;
    if (RC == &AMDGPU::VReg_512_Align2RegClass ||
        RC == &AMDGPU::AReg_512_Align2RegClass)
      return &AMDGPU::AV_512_Align2RegClass;
    if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
      return &AMDGPU::AV_1024RegClass;
    if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
        RC == &AMDGPU::AReg_1024_Align2RegClass)
      return &AMDGPU::AV_1024_Align2RegClass;
  }

  return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
}

Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI = ST.getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry and chain
  // functions, but never actually want to reference it when accessing our own
  // frame. If we need a frame pointer we use it, but otherwise we can just use
  // an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isBottomOfStack()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}

bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return AMDGPU_AllAllocatableSRegs_RegMask;
}

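// Recover the sub-register index that covers NumRegs 32-bit registers
// starting at 32-bit channel Channel. For example (using the generated
// indices), getSubRegFromChannel(2, 2) yields sub2_sub3.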
unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}

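// Returns the highest Align-aligned register of class RC that still fits into
// the wave's SGPR budget. For example, with 104 SGPRs available and Align = 4,
// the base index is 100, so a 128-bit class yields the s[100:103] tuple.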
MCRegister
SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
                                        const unsigned Align,
                                        const TargetRegisterClass *RC) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
}

MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
  return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Reserve special purpose registers.
  //
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);

  // Reserve SGPRs.
  //
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (const TargetRegisterClass *RC : regclasses()) {
    if (RC->isBaseClass() && isSGPRClass(RC)) {
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
      for (MCPhysReg Reg : *RC) {
        unsigned Index = getHWRegIndex(Reg);
        if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
          Reserved.set(Reg);
      }
    }
  }

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
  if (LongBranchReservedReg)
    reserveRegisterTuples(Reserved, LongBranchReservedReg);

  // We have to assume the SP is needed in case there are calls in the function,
  // which are only detected after the function is lowered. If we aren't really
  // going to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  // FIXME: Use same reserved register introduced in D149775
  // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
  Register ExecCopyReg = MFI->getSGPRForEXECCopy();
  if (ExecCopyReg)
    reserveRegisterTuples(Reserved, ExecCopyReg);

  // Reserve VGPRs/AGPRs.
  //
  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
  // register file accordingly.
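  // For example, if the budget is 512 registers and the function uses AGPRs,
  // the split below leaves 256 VGPRs and 256 AGPRs.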
  if (ST.hasGFX90AInsts()) {
    if (MFI->usesAGPRs(MF)) {
      MaxNumVGPRs /= 2;
      MaxNumAGPRs = MaxNumVGPRs;
    } else {
      if (MaxNumVGPRs > TotalNumVGPRs) {
        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
        MaxNumVGPRs = TotalNumVGPRs;
      } else
        MaxNumAGPRs = 0;
    }
  }

  for (const TargetRegisterClass *RC : regclasses()) {
    if (RC->isBaseClass() && isVGPRClass(RC)) {
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
      for (MCPhysReg Reg : *RC) {
        unsigned Index = getHWRegIndex(Reg);
        if (Index + NumRegs > MaxNumVGPRs)
          Reserved.set(Reg);
      }
    }
  }

  // Reserve all the AGPRs if there are no instructions to use them.
  if (!ST.hasMAIInsts())
    MaxNumAGPRs = 0;
  for (const TargetRegisterClass *RC : regclasses()) {
    if (RC->isBaseClass() && isAGPRClass(RC)) {
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
      for (MCPhysReg Reg : *RC) {
        unsigned Index = getHWRegIndex(Reg);
        if (Index + NumRegs > MaxNumAGPRs)
          Reserved.set(Reg);
      }
    }
  }

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
  }

  for (Register Reg : MFI->getWWMReservedRegs())
    reserveRegisterTuples(Reserved, Reg);

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}

bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
                                      MCRegister PhysReg) const {
  return !MF.getRegInfo().isReserved(PhysReg);
}

bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry or in chain functions, the base address is 0, so it can't possibly
  // need any more alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isBottomOfStack())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return !TII->isLegalMUBUFImmOffset(FullOffset);

  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}

Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg);
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg)
      .addImm(0); // clamp bit

  return BaseReg;
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return TII->isLegalMUBUFImmOffset(NewOffset);

  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
  if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
    return getEquivalentVGPRClass(RC);
  if (RC == &AMDGPU::SCC_CLASSRegClass)
    return getWaveMaskRegClass();

  return RC;
}

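// Size of a SI_SPILL_* pseudo's data operand in 32-bit registers (e.g. 4 for
// the 128-bit spill pseudos).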
static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_V384_SAVE:
  case AMDGPU::SI_SPILL_V384_RESTORE:
  case AMDGPU::SI_SPILL_A384_SAVE:
  case AMDGPU::SI_SPILL_A384_RESTORE:
  case AMDGPU::SI_SPILL_AV384_SAVE:
  case AMDGPU::SI_SPILL_AV384_RESTORE:
    return 12;
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_V352_SAVE:
  case AMDGPU::SI_SPILL_V352_RESTORE:
  case AMDGPU::SI_SPILL_A352_SAVE:
  case AMDGPU::SI_SPILL_A352_RESTORE:
  case AMDGPU::SI_SPILL_AV352_SAVE:
  case AMDGPU::SI_SPILL_AV352_RESTORE:
    return 11;
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_V320_SAVE:
  case AMDGPU::SI_SPILL_V320_RESTORE:
  case AMDGPU::SI_SPILL_A320_SAVE:
  case AMDGPU::SI_SPILL_A320_RESTORE:
  case AMDGPU::SI_SPILL_AV320_SAVE:
  case AMDGPU::SI_SPILL_AV320_RESTORE:
    return 10;
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_V288_SAVE:
  case AMDGPU::SI_SPILL_V288_RESTORE:
  case AMDGPU::SI_SPILL_A288_SAVE:
  case AMDGPU::SI_SPILL_A288_RESTORE:
  case AMDGPU::SI_SPILL_AV288_SAVE:
  case AMDGPU::SI_SPILL_AV288_RESTORE:
    return 9;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_V32_SAVE:
  case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
  case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

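// Map an OFFEN (VGPR-addressed) MUBUF store opcode to its OFFSET
// (immediate-only) form, returning -1 if there is no direct equivalent.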
static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffenMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static int getOffenMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
  default:
    return -1;
  }
}

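// Try to satisfy a spill or reload of ValueReg for frame Index and lane with a
// direct move to/from the AGPR or VGPR assigned to that lane, if any. Returns
// an empty MachineInstrBuilder when no register is assigned, in which case the
// caller must emit a real memory access.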
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  bool IsVGPR = TRI->isVGPR(MRI, Reg);
  DebugLoc DL = MI->getDebugLoc();
  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
    // Spiller during regalloc may restore a spilled register to its superclass.
    // It could result in AGPR spills restored to VGPRs or the other way around,
    // leaving the src and dst with identical regclasses at this point. It just
    // needs a copy in such cases.
    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
                       .addReg(Src, getKillRegState(IsKill));
    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    return CopyMIB;
  }
  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

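// Select the flat-scratch spill opcode for the given element size: for
// example, a 16-byte element uses SCRATCH_STORE_DWORDX4_SADDR for stores, then
// switches to the SV form when a vaddr is present or the ST form when no
// address register is used at all.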
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
  bool UseST =
      !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  if (HasVAddr)
    LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
  else if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}

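// Emit the scratch load/store sequence for a spill or reload of ValueReg at
// frame Index. Wide registers are split into EltSize pieces, AGPR operands go
// through a temporary VGPR, and an SGPR or VGPR is scavenged for the scratch
// offset whenever the immediate offset does not fit the instruction.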
void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LiveRegUnits *LiveUnits) const {
  assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool CanClobberSCC = false;
  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t MaterializedOffset = Offset;

  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;

  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  // Track a VGPR to use for a constant offset we need to materialize.
  Register TmpOffsetVGPR;

  // Track a VGPR to use as an intermediate value.
  Register TmpIntermediateVGPR;
  bool UseVGPROffset = false;

  // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
  // combination.
  auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
                                int64_t VOffset) {
    // We are using a VGPR offset
    if (IsFlat && SGPRBase) {
      // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
      // SGPR, so perform the add as vector.
      // We don't need a base SGPR in the kernel.

      if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
            .addReg(SGPRBase)
            .addImm(VOffset)
            .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
            .addReg(SGPRBase);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
            .addImm(VOffset)
            .addReg(TmpOffsetVGPR);
      }
    } else {
      assert(TmpOffsetVGPR);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addImm(VOffset);
    }
  };

  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : TII->isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs(), so use LiveUnits in this case.
    // TODO: Clobbering SCC is not necessary for scratch instructions in the
    // entry.
    if (RS) {
      SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);

      // Piggy back on the liveness scan we just did to see if SCC is dead.
      CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
    } else if (LiveUnits) {
      CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
          SOffset = Reg;
          break;
        }
      }
    }

    if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
      SOffset = Register();

    if (!SOffset) {
      UseVGPROffset = true;

      if (RS) {
        TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
      } else {
        assert(LiveUnits);
        for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
          if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
            TmpOffsetVGPR = Reg;
            break;
          }
        }
      }

      assert(TmpOffsetVGPR);
    } else if (!SOffset && CanClobberSCC) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to add
      // the offset directly to the ScratchOffset or StackPtrOffset register,
      // and then subtract the offset after the spill to return the register to
      // its original value.

      // TODO: If we don't have to do an emergency stack slot spill, converting
      // to use the VGPR offset is fewer instructions.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    if (!IsFlat && !UseVGPROffset)
      Offset *= ST.getWavefrontSize();

    if (!UseVGPROffset && !SOffset)
      report_fatal_error("could not scavenge SGPR to spill in entry function");

    if (UseVGPROffset) {
      // We are using a VGPR offset
      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
    } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
    } else {
      assert(Offset != 0);
      auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
                     .addReg(ScratchOffsetReg)
                     .addImm(Offset);
      Add->getOperand(3).setIsDead(); // Mark SCC as dead.
    }

    Offset = 0;
  }

  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
           "Unexpected vaddr for flat scratch with a FI operand");

    if (UseVGPROffset) {
      LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
    } else {
      assert(ST.hasFlatScratchSTMode());
      LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    }

    Desc = &TII->get(LoadStoreOp);
  }

  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    }
    Desc = &TII->get(LoadStoreOp);

    if (!IsFlat && UseVGPROffset) {
      int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
                                   : getOffenMUBUFLoad(LoadStoreOp);
      Desc = &TII->get(NewLoadStoreOp);
    }

    if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
      // If we are spilling an AGPR beyond the range of the memory instruction
      // offset and need to use a VGPR offset, we ideally have at least 2
      // scratch VGPRs. If we don't have a second free VGPR without spilling,
      // recycle the VGPR used for the offset which requires resetting after
      // each subregister.

      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
    }

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
                          ? ValueReg
                          : Register(getSubReg(ValueReg,
                                               getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    const bool IsLastSubReg = i + 1 == e;
    const bool IsFirstSubReg = i == 0;
    if (IsLastSubReg) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
    bool NeedSuperRegImpOperand = e > 1;

    // Remaining element size to spill into memory after some parts of it
    // spilled into either AGPRs or VGPRs.
    unsigned RemEltSize = EltSize;

    // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
    // starting from the last lane. If a register cannot be completely spilled
    // into another register, this ensures its alignment does not change. For
    // targets with a VGPR alignment requirement this is important in case of
    // flat scratch usage, as we might get a scratch_load or scratch_store of an
    // unaligned register otherwise.
For targets with VGPR alignment requirement this is important 1570 // in case of flat scratch usage as we might get a scratch_load or 1571 // scratch_store of an unaligned register otherwise. 1572 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS, 1573 LaneE = RegOffset / 4; 1574 Lane >= LaneE; --Lane) { 1575 bool IsSubReg = e > 1 || EltSize > 4; 1576 Register Sub = IsSubReg 1577 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) 1578 : ValueReg; 1579 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill); 1580 if (!MIB.getInstr()) 1581 break; 1582 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) { 1583 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1584 NeedSuperRegDef = false; 1585 } 1586 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) { 1587 NeedSuperRegImpOperand = true; 1588 unsigned State = SrcDstRegState; 1589 if (!IsLastSubReg || (Lane != LaneE)) 1590 State &= ~RegState::Kill; 1591 if (!IsFirstSubReg || (Lane != LaneS)) 1592 State &= ~RegState::Define; 1593 MIB.addReg(ValueReg, RegState::Implicit | State); 1594 } 1595 RemEltSize -= 4; 1596 } 1597 1598 if (!RemEltSize) // Fully spilled into AGPRs. 1599 continue; 1600 1601 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 1602 assert(IsFlat && EltSize > 4); 1603 1604 unsigned NumRegs = RemEltSize / 4; 1605 SubReg = Register(getSubReg(ValueReg, 1606 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1607 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 1608 Desc = &TII->get(Opc); 1609 } 1610 1611 unsigned FinalReg = SubReg; 1612 1613 if (IsAGPR) { 1614 assert(EltSize == 4); 1615 1616 if (!TmpIntermediateVGPR) { 1617 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy(); 1618 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR)); 1619 } 1620 if (IsStore) { 1621 auto AccRead = BuildMI(MBB, MI, DL, 1622 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), 1623 TmpIntermediateVGPR) 1624 .addReg(SubReg, getKillRegState(IsKill)); 1625 if (NeedSuperRegDef) 1626 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 1627 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1628 } 1629 SubReg = TmpIntermediateVGPR; 1630 } else if (UseVGPROffset) { 1631 if (!TmpOffsetVGPR) { 1632 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 1633 MI, false, 0); 1634 RS->setRegUsed(TmpOffsetVGPR); 1635 } 1636 } 1637 1638 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); 1639 MachineMemOperand *NewMMO = 1640 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 1641 commonAlignment(Alignment, RegOffset)); 1642 1643 auto MIB = 1644 BuildMI(MBB, MI, DL, *Desc) 1645 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); 1646 1647 if (UseVGPROffset) { 1648 // For an AGPR spill, we reuse the same temp VGPR for the offset and the 1649 // intermediate accvgpr_write. 1650 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR)); 1651 } 1652 1653 if (!IsFlat) 1654 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1655 1656 if (SOffset == AMDGPU::NoRegister) { 1657 if (!IsFlat) { 1658 if (UseVGPROffset && ScratchOffsetReg) { 1659 MIB.addReg(ScratchOffsetReg); 1660 } else { 1661 assert(FuncInfo->isBottomOfStack()); 1662 MIB.addImm(0); 1663 } 1664 } 1665 } else { 1666 MIB.addReg(SOffset, SOffsetRegState); 1667 } 1668 1669 MIB.addImm(Offset + RegOffset); 1670 1671 bool LastUse = MMO->getFlags() & MOLastUse; 1672 MIB.addImm(LastUse ? 
AMDGPU::CPol::TH_LU : 0); // cpol 1673 1674 if (!IsFlat) 1675 MIB.addImm(0); // swz 1676 MIB.addMemOperand(NewMMO); 1677 1678 if (!IsAGPR && NeedSuperRegDef) 1679 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1680 1681 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) { 1682 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1683 FinalReg) 1684 .addReg(TmpIntermediateVGPR, RegState::Kill); 1685 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1686 } 1687 1688 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) 1689 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1690 1691 // The epilog restore of a wwm-scratch register can cause undesired 1692 // optimization during machine-cp post PrologEpilogInserter if the same 1693 // register was assigned for return value ABI lowering with a COPY 1694 // instruction. As given below, with the epilog reload, the earlier COPY 1695 // appeared to be dead during machine-cp. 1696 // ... 1697 // v0 in WWM operation, needs the WWM spill at prolog/epilog. 1698 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 1699 // ... 1700 // Epilog block: 1701 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 1702 // ... 1703 // WWM spill restore to preserve the inactive lanes of v0. 1704 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 1705 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 1706 // $exec = S_MOV_B64 killed $sgpr4_sgpr5 1707 // ... 1708 // SI_RETURN implicit $vgpr0 1709 // ... 1710 // To fix it, mark the same reg as a tied op for such restore instructions 1711 // so that it marks a usage for the preceding COPY. 1712 if (!IsStore && MI != MBB.end() && MI->isReturn() && 1713 MI->readsRegister(SubReg, this)) { 1714 MIB.addReg(SubReg, RegState::Implicit); 1715 MIB->tieOperands(0, MIB->getNumOperands() - 1); 1716 } 1717 } 1718 1719 if (ScratchOffsetRegDelta != 0) { 1720 // Subtract the offset we added to the ScratchOffset register. 1721 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1722 .addReg(SOffset) 1723 .addImm(-ScratchOffsetRegDelta); 1724 } 1725 } 1726 1727 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1728 int Offset, bool IsLoad, 1729 bool IsKill) const { 1730 // Load/store VGPR 1731 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 1732 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1733 1734 Register FrameReg = 1735 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 1736 ? getBaseRegister() 1737 : getFrameRegister(SB.MF); 1738 1739 Align Alignment = FrameInfo.getObjectAlign(Index); 1740 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 1741 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 1742 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1743 SB.EltSize, Alignment); 1744 1745 if (IsLoad) { 1746 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1747 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1748 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1749 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS); 1750 } else { 1751 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 1752 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1753 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1754 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS); 1755 // This only ever adds one VGPR spill 1756 SB.MFI.addToSpilledVGPRs(1); 1757 } 1758 } 1759 1760 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, 1761 RegScavenger *RS, SlotIndexes *Indexes, 1762 LiveIntervals *LIS, bool OnlyToVGPR, 1763 bool SpillToPhysVGPRLane) const { 1764 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1765 1766 ArrayRef<SpilledReg> VGPRSpills = 1767 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) 1768 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); 1769 bool SpillToVGPR = !VGPRSpills.empty(); 1770 if (OnlyToVGPR && !SpillToVGPR) 1771 return false; 1772 1773 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1774 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1775 1776 if (SpillToVGPR) { 1777 1778 assert(SB.NumSubRegs == VGPRSpills.size() && 1779 "Num of VGPR lanes should be equal to num of SGPRs spilled"); 1780 1781 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1782 Register SubReg = 1783 SB.NumSubRegs == 1 1784 ? SB.SuperReg 1785 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1786 SpilledReg Spill = VGPRSpills[i]; 1787 1788 bool IsFirstSubreg = i == 0; 1789 bool IsLastSubreg = i == SB.NumSubRegs - 1; 1790 bool UseKill = SB.IsKill && IsLastSubreg; 1791 1792 1793 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1794 // spill to this specific vgpr in the first basic block. 1795 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1796 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR) 1797 .addReg(SubReg, getKillRegState(UseKill)) 1798 .addImm(Spill.Lane) 1799 .addReg(Spill.VGPR); 1800 if (Indexes) { 1801 if (IsFirstSubreg) 1802 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1803 else 1804 Indexes->insertMachineInstrInMaps(*MIB); 1805 } 1806 1807 if (IsFirstSubreg && SB.NumSubRegs > 1) { 1808 // We may be spilling a super-register which is only partially defined, 1809 // and need to ensure later spills think the value is defined. 1810 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1811 } 1812 1813 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg)) 1814 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1815 1816 // FIXME: Since this spills to another register instead of an actual 1817 // frame index, we should delete the frame index when all references to 1818 // it are fixed. 1819 } 1820 } else { 1821 SB.prepare(); 1822 1823 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 1824 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1825 1826 // Per VGPR helper data 1827 auto PVD = SB.getPerVGPRData(); 1828 1829 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1830 unsigned TmpVGPRFlags = RegState::Undef; 1831 1832 // Write sub registers into the VGPR 1833 for (unsigned i = Offset * PVD.PerVGPR, 1834 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1835 i < e; ++i) { 1836 Register SubReg = 1837 SB.NumSubRegs == 1 1838 ? 
SB.SuperReg 1839 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1840 1841 MachineInstrBuilder WriteLane = 1842 BuildMI(*SB.MBB, MI, SB.DL, 1843 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR) 1844 .addReg(SubReg, SubKillState) 1845 .addImm(i % PVD.PerVGPR) 1846 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1847 TmpVGPRFlags = 0; 1848 1849 if (Indexes) { 1850 if (i == 0) 1851 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane); 1852 else 1853 Indexes->insertMachineInstrInMaps(*WriteLane); 1854 } 1855 1856 // There could be undef components of a spilled super register. 1857 // TODO: Can we detect this and skip the spill? 1858 if (SB.NumSubRegs > 1) { 1859 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1860 unsigned SuperKillState = 0; 1861 if (i + 1 == SB.NumSubRegs) 1862 SuperKillState |= getKillRegState(SB.IsKill); 1863 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1864 } 1865 } 1866 1867 // Write out VGPR 1868 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 1869 } 1870 1871 SB.restore(); 1872 } 1873 1874 MI->eraseFromParent(); 1875 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1876 1877 if (LIS) 1878 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1879 1880 return true; 1881 } 1882 1883 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, 1884 RegScavenger *RS, SlotIndexes *Indexes, 1885 LiveIntervals *LIS, bool OnlyToVGPR, 1886 bool SpillToPhysVGPRLane) const { 1887 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1888 1889 ArrayRef<SpilledReg> VGPRSpills = 1890 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) 1891 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); 1892 bool SpillToVGPR = !VGPRSpills.empty(); 1893 if (OnlyToVGPR && !SpillToVGPR) 1894 return false; 1895 1896 if (SpillToVGPR) { 1897 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1898 Register SubReg = 1899 SB.NumSubRegs == 1 1900 ? SB.SuperReg 1901 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1902 1903 SpilledReg Spill = VGPRSpills[i]; 1904 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1905 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 1906 .addReg(Spill.VGPR) 1907 .addImm(Spill.Lane); 1908 if (SB.NumSubRegs > 1 && i == 0) 1909 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1910 if (Indexes) { 1911 if (i == e - 1) 1912 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1913 else 1914 Indexes->insertMachineInstrInMaps(*MIB); 1915 } 1916 } 1917 } else { 1918 SB.prepare(); 1919 1920 // Per VGPR helper data 1921 auto PVD = SB.getPerVGPRData(); 1922 1923 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1924 // Load in VGPR data 1925 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 1926 1927 // Unpack lanes 1928 for (unsigned i = Offset * PVD.PerVGPR, 1929 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1930 i < e; ++i) { 1931 Register SubReg = 1932 SB.NumSubRegs == 1 1933 ? 
SB.SuperReg 1934 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1935 1936 bool LastSubReg = (i + 1 == e); 1937 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1938 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 1939 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1940 .addImm(i); 1941 if (SB.NumSubRegs > 1 && i == 0) 1942 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1943 if (Indexes) { 1944 if (i == e - 1) 1945 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1946 else 1947 Indexes->insertMachineInstrInMaps(*MIB); 1948 } 1949 } 1950 } 1951 1952 SB.restore(); 1953 } 1954 1955 MI->eraseFromParent(); 1956 1957 if (LIS) 1958 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1959 1960 return true; 1961 } 1962 1963 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 1964 MachineBasicBlock &RestoreMBB, 1965 Register SGPR, RegScavenger *RS) const { 1966 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 1967 RS); 1968 SB.prepare(); 1969 // Generate the spill of SGPR to SB.TmpVGPR. 1970 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1971 auto PVD = SB.getPerVGPRData(); 1972 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1973 unsigned TmpVGPRFlags = RegState::Undef; 1974 // Write sub registers into the VGPR 1975 for (unsigned i = Offset * PVD.PerVGPR, 1976 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1977 i < e; ++i) { 1978 Register SubReg = 1979 SB.NumSubRegs == 1 1980 ? SB.SuperReg 1981 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1982 1983 MachineInstrBuilder WriteLane = 1984 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1985 SB.TmpVGPR) 1986 .addReg(SubReg, SubKillState) 1987 .addImm(i % PVD.PerVGPR) 1988 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1989 TmpVGPRFlags = 0; 1990 // There could be undef components of a spilled super register. 1991 // TODO: Can we detect this and skip the spill? 1992 if (SB.NumSubRegs > 1) { 1993 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1994 unsigned SuperKillState = 0; 1995 if (i + 1 == SB.NumSubRegs) 1996 SuperKillState |= getKillRegState(SB.IsKill); 1997 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1998 } 1999 } 2000 // Don't need to write VGPR out. 2001 } 2002 2003 // Restore clobbered registers in the specified restore block. 2004 MI = RestoreMBB.end(); 2005 SB.setMI(&RestoreMBB, MI); 2006 // Generate the restore of SGPR from SB.TmpVGPR. 2007 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2008 // Don't need to load VGPR in. 2009 // Unpack lanes 2010 for (unsigned i = Offset * PVD.PerVGPR, 2011 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2012 i < e; ++i) { 2013 Register SubReg = 2014 SB.NumSubRegs == 1 2015 ? SB.SuperReg 2016 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2017 bool LastSubReg = (i + 1 == e); 2018 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 2019 SubReg) 2020 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 2021 .addImm(i); 2022 if (SB.NumSubRegs > 1 && i == 0) 2023 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2024 } 2025 } 2026 SB.restore(); 2027 2028 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 2029 return false; 2030 } 2031 2032 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 2033 /// a VGPR and the stack slot can be safely eliminated when all other users are 2034 /// handled. 
2035 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 2036 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, 2037 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { 2038 switch (MI->getOpcode()) { 2039 case AMDGPU::SI_SPILL_S1024_SAVE: 2040 case AMDGPU::SI_SPILL_S512_SAVE: 2041 case AMDGPU::SI_SPILL_S384_SAVE: 2042 case AMDGPU::SI_SPILL_S352_SAVE: 2043 case AMDGPU::SI_SPILL_S320_SAVE: 2044 case AMDGPU::SI_SPILL_S288_SAVE: 2045 case AMDGPU::SI_SPILL_S256_SAVE: 2046 case AMDGPU::SI_SPILL_S224_SAVE: 2047 case AMDGPU::SI_SPILL_S192_SAVE: 2048 case AMDGPU::SI_SPILL_S160_SAVE: 2049 case AMDGPU::SI_SPILL_S128_SAVE: 2050 case AMDGPU::SI_SPILL_S96_SAVE: 2051 case AMDGPU::SI_SPILL_S64_SAVE: 2052 case AMDGPU::SI_SPILL_S32_SAVE: 2053 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2054 case AMDGPU::SI_SPILL_S1024_RESTORE: 2055 case AMDGPU::SI_SPILL_S512_RESTORE: 2056 case AMDGPU::SI_SPILL_S384_RESTORE: 2057 case AMDGPU::SI_SPILL_S352_RESTORE: 2058 case AMDGPU::SI_SPILL_S320_RESTORE: 2059 case AMDGPU::SI_SPILL_S288_RESTORE: 2060 case AMDGPU::SI_SPILL_S256_RESTORE: 2061 case AMDGPU::SI_SPILL_S224_RESTORE: 2062 case AMDGPU::SI_SPILL_S192_RESTORE: 2063 case AMDGPU::SI_SPILL_S160_RESTORE: 2064 case AMDGPU::SI_SPILL_S128_RESTORE: 2065 case AMDGPU::SI_SPILL_S96_RESTORE: 2066 case AMDGPU::SI_SPILL_S64_RESTORE: 2067 case AMDGPU::SI_SPILL_S32_RESTORE: 2068 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2069 default: 2070 llvm_unreachable("not an SGPR spill instruction"); 2071 } 2072 } 2073 2074 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 2075 int SPAdj, unsigned FIOperandNum, 2076 RegScavenger *RS) const { 2077 MachineFunction *MF = MI->getParent()->getParent(); 2078 MachineBasicBlock *MBB = MI->getParent(); 2079 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 2080 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 2081 const SIInstrInfo *TII = ST.getInstrInfo(); 2082 DebugLoc DL = MI->getDebugLoc(); 2083 2084 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 2085 2086 assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) && 2087 "unreserved scratch RSRC register"); 2088 2089 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 2090 int Index = MI->getOperand(FIOperandNum).getIndex(); 2091 2092 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 2093 ? 
getBaseRegister() 2094 : getFrameRegister(*MF); 2095 2096 switch (MI->getOpcode()) { 2097 // SGPR register spill 2098 case AMDGPU::SI_SPILL_S1024_SAVE: 2099 case AMDGPU::SI_SPILL_S512_SAVE: 2100 case AMDGPU::SI_SPILL_S384_SAVE: 2101 case AMDGPU::SI_SPILL_S352_SAVE: 2102 case AMDGPU::SI_SPILL_S320_SAVE: 2103 case AMDGPU::SI_SPILL_S288_SAVE: 2104 case AMDGPU::SI_SPILL_S256_SAVE: 2105 case AMDGPU::SI_SPILL_S224_SAVE: 2106 case AMDGPU::SI_SPILL_S192_SAVE: 2107 case AMDGPU::SI_SPILL_S160_SAVE: 2108 case AMDGPU::SI_SPILL_S128_SAVE: 2109 case AMDGPU::SI_SPILL_S96_SAVE: 2110 case AMDGPU::SI_SPILL_S64_SAVE: 2111 case AMDGPU::SI_SPILL_S32_SAVE: { 2112 return spillSGPR(MI, Index, RS); 2113 } 2114 2115 // SGPR register restore 2116 case AMDGPU::SI_SPILL_S1024_RESTORE: 2117 case AMDGPU::SI_SPILL_S512_RESTORE: 2118 case AMDGPU::SI_SPILL_S384_RESTORE: 2119 case AMDGPU::SI_SPILL_S352_RESTORE: 2120 case AMDGPU::SI_SPILL_S320_RESTORE: 2121 case AMDGPU::SI_SPILL_S288_RESTORE: 2122 case AMDGPU::SI_SPILL_S256_RESTORE: 2123 case AMDGPU::SI_SPILL_S224_RESTORE: 2124 case AMDGPU::SI_SPILL_S192_RESTORE: 2125 case AMDGPU::SI_SPILL_S160_RESTORE: 2126 case AMDGPU::SI_SPILL_S128_RESTORE: 2127 case AMDGPU::SI_SPILL_S96_RESTORE: 2128 case AMDGPU::SI_SPILL_S64_RESTORE: 2129 case AMDGPU::SI_SPILL_S32_RESTORE: { 2130 return restoreSGPR(MI, Index, RS); 2131 } 2132 2133 // VGPR register spill 2134 case AMDGPU::SI_SPILL_V1024_SAVE: 2135 case AMDGPU::SI_SPILL_V512_SAVE: 2136 case AMDGPU::SI_SPILL_V384_SAVE: 2137 case AMDGPU::SI_SPILL_V352_SAVE: 2138 case AMDGPU::SI_SPILL_V320_SAVE: 2139 case AMDGPU::SI_SPILL_V288_SAVE: 2140 case AMDGPU::SI_SPILL_V256_SAVE: 2141 case AMDGPU::SI_SPILL_V224_SAVE: 2142 case AMDGPU::SI_SPILL_V192_SAVE: 2143 case AMDGPU::SI_SPILL_V160_SAVE: 2144 case AMDGPU::SI_SPILL_V128_SAVE: 2145 case AMDGPU::SI_SPILL_V96_SAVE: 2146 case AMDGPU::SI_SPILL_V64_SAVE: 2147 case AMDGPU::SI_SPILL_V32_SAVE: 2148 case AMDGPU::SI_SPILL_A1024_SAVE: 2149 case AMDGPU::SI_SPILL_A512_SAVE: 2150 case AMDGPU::SI_SPILL_A384_SAVE: 2151 case AMDGPU::SI_SPILL_A352_SAVE: 2152 case AMDGPU::SI_SPILL_A320_SAVE: 2153 case AMDGPU::SI_SPILL_A288_SAVE: 2154 case AMDGPU::SI_SPILL_A256_SAVE: 2155 case AMDGPU::SI_SPILL_A224_SAVE: 2156 case AMDGPU::SI_SPILL_A192_SAVE: 2157 case AMDGPU::SI_SPILL_A160_SAVE: 2158 case AMDGPU::SI_SPILL_A128_SAVE: 2159 case AMDGPU::SI_SPILL_A96_SAVE: 2160 case AMDGPU::SI_SPILL_A64_SAVE: 2161 case AMDGPU::SI_SPILL_A32_SAVE: 2162 case AMDGPU::SI_SPILL_AV1024_SAVE: 2163 case AMDGPU::SI_SPILL_AV512_SAVE: 2164 case AMDGPU::SI_SPILL_AV384_SAVE: 2165 case AMDGPU::SI_SPILL_AV352_SAVE: 2166 case AMDGPU::SI_SPILL_AV320_SAVE: 2167 case AMDGPU::SI_SPILL_AV288_SAVE: 2168 case AMDGPU::SI_SPILL_AV256_SAVE: 2169 case AMDGPU::SI_SPILL_AV224_SAVE: 2170 case AMDGPU::SI_SPILL_AV192_SAVE: 2171 case AMDGPU::SI_SPILL_AV160_SAVE: 2172 case AMDGPU::SI_SPILL_AV128_SAVE: 2173 case AMDGPU::SI_SPILL_AV96_SAVE: 2174 case AMDGPU::SI_SPILL_AV64_SAVE: 2175 case AMDGPU::SI_SPILL_AV32_SAVE: 2176 case AMDGPU::SI_SPILL_WWM_V32_SAVE: 2177 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: { 2178 const MachineOperand *VData = TII->getNamedOperand(*MI, 2179 AMDGPU::OpName::vdata); 2180 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2181 MFI->getStackPtrOffsetReg()); 2182 2183 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 2184 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 2185 auto *MBB = MI->getParent(); 2186 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2187 if (IsWWMRegSpill) { 2188 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2189 RS->isRegUsed(AMDGPU::SCC)); 2190 } 2191 buildSpillLoadStore( 2192 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2193 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2194 *MI->memoperands_begin(), RS); 2195 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 2196 if (IsWWMRegSpill) 2197 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2198 2199 MI->eraseFromParent(); 2200 return true; 2201 } 2202 case AMDGPU::SI_SPILL_V32_RESTORE: 2203 case AMDGPU::SI_SPILL_V64_RESTORE: 2204 case AMDGPU::SI_SPILL_V96_RESTORE: 2205 case AMDGPU::SI_SPILL_V128_RESTORE: 2206 case AMDGPU::SI_SPILL_V160_RESTORE: 2207 case AMDGPU::SI_SPILL_V192_RESTORE: 2208 case AMDGPU::SI_SPILL_V224_RESTORE: 2209 case AMDGPU::SI_SPILL_V256_RESTORE: 2210 case AMDGPU::SI_SPILL_V288_RESTORE: 2211 case AMDGPU::SI_SPILL_V320_RESTORE: 2212 case AMDGPU::SI_SPILL_V352_RESTORE: 2213 case AMDGPU::SI_SPILL_V384_RESTORE: 2214 case AMDGPU::SI_SPILL_V512_RESTORE: 2215 case AMDGPU::SI_SPILL_V1024_RESTORE: 2216 case AMDGPU::SI_SPILL_A32_RESTORE: 2217 case AMDGPU::SI_SPILL_A64_RESTORE: 2218 case AMDGPU::SI_SPILL_A96_RESTORE: 2219 case AMDGPU::SI_SPILL_A128_RESTORE: 2220 case AMDGPU::SI_SPILL_A160_RESTORE: 2221 case AMDGPU::SI_SPILL_A192_RESTORE: 2222 case AMDGPU::SI_SPILL_A224_RESTORE: 2223 case AMDGPU::SI_SPILL_A256_RESTORE: 2224 case AMDGPU::SI_SPILL_A288_RESTORE: 2225 case AMDGPU::SI_SPILL_A320_RESTORE: 2226 case AMDGPU::SI_SPILL_A352_RESTORE: 2227 case AMDGPU::SI_SPILL_A384_RESTORE: 2228 case AMDGPU::SI_SPILL_A512_RESTORE: 2229 case AMDGPU::SI_SPILL_A1024_RESTORE: 2230 case AMDGPU::SI_SPILL_AV32_RESTORE: 2231 case AMDGPU::SI_SPILL_AV64_RESTORE: 2232 case AMDGPU::SI_SPILL_AV96_RESTORE: 2233 case AMDGPU::SI_SPILL_AV128_RESTORE: 2234 case AMDGPU::SI_SPILL_AV160_RESTORE: 2235 case AMDGPU::SI_SPILL_AV192_RESTORE: 2236 case AMDGPU::SI_SPILL_AV224_RESTORE: 2237 case AMDGPU::SI_SPILL_AV256_RESTORE: 2238 case AMDGPU::SI_SPILL_AV288_RESTORE: 2239 case AMDGPU::SI_SPILL_AV320_RESTORE: 2240 case AMDGPU::SI_SPILL_AV352_RESTORE: 2241 case AMDGPU::SI_SPILL_AV384_RESTORE: 2242 case AMDGPU::SI_SPILL_AV512_RESTORE: 2243 case AMDGPU::SI_SPILL_AV1024_RESTORE: 2244 case AMDGPU::SI_SPILL_WWM_V32_RESTORE: 2245 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: { 2246 const MachineOperand *VData = TII->getNamedOperand(*MI, 2247 AMDGPU::OpName::vdata); 2248 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2249 MFI->getStackPtrOffsetReg()); 2250 2251 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 2252 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 2253 auto *MBB = MI->getParent(); 2254 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2255 if (IsWWMRegSpill) { 2256 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2257 RS->isRegUsed(AMDGPU::SCC)); 2258 } 2259 2260 buildSpillLoadStore( 2261 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2262 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2263 *MI->memoperands_begin(), RS); 2264 2265 if (IsWWMRegSpill) 2266 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2267 2268 MI->eraseFromParent(); 2269 return true; 2270 } 2271 2272 default: { 2273 // Other access to frame index 2274 const DebugLoc &DL = MI->getDebugLoc(); 2275 2276 int64_t Offset = FrameInfo.getObjectOffset(Index); 2277 if (ST.enableFlatScratch()) { 2278 if (TII->isFLATScratch(*MI)) { 2279 assert((int16_t)FIOperandNum == 2280 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2281 AMDGPU::OpName::saddr)); 2282 2283 // The offset is always swizzled, just replace it 2284 if (FrameReg) 2285 FIOp.ChangeToRegister(FrameReg, false); 2286 2287 MachineOperand *OffsetOp = 2288 TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 2289 int64_t NewOffset = Offset + OffsetOp->getImm(); 2290 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 2291 SIInstrFlags::FlatScratch)) { 2292 OffsetOp->setImm(NewOffset); 2293 if (FrameReg) 2294 return false; 2295 Offset = 0; 2296 } 2297 2298 if (!Offset) { 2299 unsigned Opc = MI->getOpcode(); 2300 int NewOpc = -1; 2301 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) { 2302 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc); 2303 } else if (ST.hasFlatScratchSTMode()) { 2304 // On GFX10 we have ST mode to use no registers for an address. 2305 // Otherwise we need to materialize 0 into an SGPR. 2306 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); 2307 } 2308 2309 if (NewOpc != -1) { 2310 // removeOperand doesn't fixup tied operand indexes as it goes, so 2311 // it asserts. Untie vdst_in for now and retie them afterwards. 2312 int VDstIn = AMDGPU::getNamedOperandIdx(Opc, 2313 AMDGPU::OpName::vdst_in); 2314 bool TiedVDst = VDstIn != -1 && 2315 MI->getOperand(VDstIn).isReg() && 2316 MI->getOperand(VDstIn).isTied(); 2317 if (TiedVDst) 2318 MI->untieRegOperand(VDstIn); 2319 2320 MI->removeOperand( 2321 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); 2322 2323 if (TiedVDst) { 2324 int NewVDst = 2325 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); 2326 int NewVDstIn = 2327 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in); 2328 assert (NewVDst != -1 && NewVDstIn != -1 && "Must be tied!"); 2329 MI->tieOperands(NewVDst, NewVDstIn); 2330 } 2331 MI->setDesc(TII->get(NewOpc)); 2332 return false; 2333 } 2334 } 2335 } 2336 2337 if (!FrameReg) { 2338 FIOp.ChangeToImmediate(Offset); 2339 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) 2340 return false; 2341 } 2342 2343 // We need to use register here. Check if we can use an SGPR or need 2344 // a VGPR. 2345 FIOp.ChangeToRegister(AMDGPU::M0, false); 2346 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp); 2347 2348 if (!Offset && FrameReg && UseSGPR) { 2349 FIOp.setReg(FrameReg); 2350 return false; 2351 } 2352 2353 const TargetRegisterClass *RC = UseSGPR ? 
&AMDGPU::SReg_32_XM0RegClass 2354 : &AMDGPU::VGPR_32RegClass; 2355 2356 Register TmpReg = 2357 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR); 2358 FIOp.setReg(TmpReg); 2359 FIOp.setIsKill(); 2360 2361 if ((!FrameReg || !Offset) && TmpReg) { 2362 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2363 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 2364 if (FrameReg) 2365 MIB.addReg(FrameReg); 2366 else 2367 MIB.addImm(Offset); 2368 2369 return false; 2370 } 2371 2372 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) && 2373 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); 2374 2375 Register TmpSReg = 2376 UseSGPR ? TmpReg 2377 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 2378 MI, false, 0, !UseSGPR); 2379 2380 // TODO: for flat scratch another attempt can be made with a VGPR index 2381 // if no SGPRs can be scavenged. 2382 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 2383 report_fatal_error("Cannot scavenge register in FI elimination!"); 2384 2385 if (!TmpSReg) { 2386 // Use frame register and restore it after. 2387 TmpSReg = FrameReg; 2388 FIOp.setReg(FrameReg); 2389 FIOp.setIsKill(false); 2390 } 2391 2392 if (NeedSaveSCC) { 2393 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!"); 2394 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg) 2395 .addReg(FrameReg) 2396 .addImm(Offset); 2397 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2398 .addReg(TmpSReg) 2399 .addImm(0); 2400 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg) 2401 .addImm(0) 2402 .addReg(TmpSReg); 2403 } else { 2404 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) 2405 .addReg(FrameReg) 2406 .addImm(Offset); 2407 } 2408 2409 if (!UseSGPR) 2410 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2411 .addReg(TmpSReg, RegState::Kill); 2412 2413 if (TmpSReg == FrameReg) { 2414 // Undo frame register modification. 2415 if (NeedSaveSCC && 2416 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) { 2417 MachineBasicBlock::iterator I = 2418 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32), 2419 TmpSReg) 2420 .addReg(FrameReg) 2421 .addImm(-Offset); 2422 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2423 .addReg(TmpSReg) 2424 .addImm(0); 2425 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32), 2426 TmpSReg) 2427 .addImm(0) 2428 .addReg(TmpSReg); 2429 } else { 2430 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), 2431 FrameReg) 2432 .addReg(FrameReg) 2433 .addImm(-Offset); 2434 } 2435 } 2436 2437 return false; 2438 } 2439 2440 bool IsMUBUF = TII->isMUBUF(*MI); 2441 2442 if (!IsMUBUF && !MFI->isBottomOfStack()) { 2443 // Convert to a swizzled stack address by scaling by the wave size. 2444 // In an entry function/kernel the offset is already swizzled. 2445 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); 2446 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) && 2447 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); 2448 const TargetRegisterClass *RC = IsSALU && !LiveSCC 2449 ? &AMDGPU::SReg_32RegClass 2450 : &AMDGPU::VGPR_32RegClass; 2451 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || 2452 MI->getOpcode() == AMDGPU::V_MOV_B32_e64; 2453 Register ResultReg = 2454 IsCopy ? MI->getOperand(0).getReg() 2455 : RS->scavengeRegisterBackwards(*RC, MI, false, 0); 2456 2457 int64_t Offset = FrameInfo.getObjectOffset(Index); 2458 if (Offset == 0) { 2459 unsigned OpCode = IsSALU && !LiveSCC ? 
AMDGPU::S_LSHR_B32 2460 : AMDGPU::V_LSHRREV_B32_e64; 2461 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg); 2462 if (OpCode == AMDGPU::V_LSHRREV_B32_e64) 2463 // For V_LSHRREV, the operands are reversed (the shift count goes 2464 // first). 2465 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg); 2466 else 2467 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2()); 2468 if (IsSALU && !LiveSCC) 2469 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. 2470 if (IsSALU && LiveSCC) { 2471 Register NewDest = RS->scavengeRegisterBackwards( 2472 AMDGPU::SReg_32RegClass, Shift, false, 0); 2473 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 2474 NewDest) 2475 .addReg(ResultReg); 2476 ResultReg = NewDest; 2477 } 2478 } else { 2479 MachineInstrBuilder MIB; 2480 if (!IsSALU) { 2481 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) != 2482 nullptr) { 2483 // Reuse ResultReg in intermediate step. 2484 Register ScaledReg = ResultReg; 2485 2486 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 2487 ScaledReg) 2488 .addImm(ST.getWavefrontSizeLog2()) 2489 .addReg(FrameReg); 2490 2491 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 2492 2493 // TODO: Fold if use instruction is another add of a constant. 2494 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 2495 // FIXME: This can fail 2496 MIB.addImm(Offset); 2497 MIB.addReg(ScaledReg, RegState::Kill); 2498 if (!IsVOP2) 2499 MIB.addImm(0); // clamp bit 2500 } else { 2501 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 2502 "Need to reuse carry out register"); 2503 2504 // Use scavenged unused carry out as offset register. 2505 Register ConstOffsetReg; 2506 if (!isWave32) 2507 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 2508 else 2509 ConstOffsetReg = MIB.getReg(1); 2510 2511 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 2512 .addImm(Offset); 2513 MIB.addReg(ConstOffsetReg, RegState::Kill); 2514 MIB.addReg(ScaledReg, RegState::Kill); 2515 MIB.addImm(0); // clamp bit 2516 } 2517 } 2518 } 2519 if (!MIB || IsSALU) { 2520 // We have to produce a carry out, and there isn't a free SGPR pair 2521 // for it. We can keep the whole computation on the SALU to avoid 2522 // clobbering an additional register at the cost of an extra mov. 2523 2524 // We may have 1 free scratch SGPR even though a carry out is 2525 // unavailable. Only one additional mov is needed. 2526 Register TmpScaledReg = RS->scavengeRegisterBackwards( 2527 AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false); 2528 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 2529 2530 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 2531 .addReg(FrameReg) 2532 .addImm(ST.getWavefrontSizeLog2()); 2533 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2534 .addReg(ScaledReg, RegState::Kill) 2535 .addImm(Offset); 2536 if (!IsSALU) 2537 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 2538 .addReg(ScaledReg, RegState::Kill); 2539 else 2540 ResultReg = ScaledReg; 2541 2542 // If there were truly no free SGPRs, we need to undo everything. 
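          // Editorial sketch of the undo sequence below (assuming a wave64
          // target, i.e. getWavefrontSizeLog2() == 6): when no spare SGPR was
          // found, ScaledReg is the frame register itself, which the code
          // above rewrote as
          //   FrameReg = (FrameReg >> 6) + Offset;
          // The two instructions guarded by !TmpScaledReg.isValid() invert
          // that as
          //   FrameReg = (FrameReg - Offset) << 6;
          // which recovers the original value as long as the frame register
          // was a multiple of the wavefront size, so the right shift dropped
          // no bits.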
2543 if (!TmpScaledReg.isValid()) { 2544 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2545 .addReg(ScaledReg, RegState::Kill) 2546 .addImm(-Offset); 2547 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 2548 .addReg(FrameReg) 2549 .addImm(ST.getWavefrontSizeLog2()); 2550 } 2551 } 2552 } 2553 2554 // Don't introduce an extra copy if we're just materializing in a mov. 2555 if (IsCopy) { 2556 MI->eraseFromParent(); 2557 return true; 2558 } 2559 FIOp.ChangeToRegister(ResultReg, false, false, true); 2560 return false; 2561 } 2562 2563 if (IsMUBUF) { 2564 // Disable offen so we don't need a 0 vgpr base. 2565 assert(static_cast<int>(FIOperandNum) == 2566 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2567 AMDGPU::OpName::vaddr)); 2568 2569 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 2570 assert((SOffset.isImm() && SOffset.getImm() == 0)); 2571 2572 if (FrameReg != AMDGPU::NoRegister) 2573 SOffset.ChangeToRegister(FrameReg, false); 2574 2575 int64_t Offset = FrameInfo.getObjectOffset(Index); 2576 int64_t OldImm 2577 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 2578 int64_t NewOffset = OldImm + Offset; 2579 2580 if (TII->isLegalMUBUFImmOffset(NewOffset) && 2581 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 2582 MI->eraseFromParent(); 2583 return true; 2584 } 2585 } 2586 2587 // If the offset is simply too big, don't convert to a scratch wave offset 2588 // relative index. 2589 2590 FIOp.ChangeToImmediate(Offset); 2591 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 2592 Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 2593 MI, false, 0); 2594 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2595 .addImm(Offset); 2596 FIOp.ChangeToRegister(TmpReg, false, false, true); 2597 } 2598 } 2599 } 2600 return false; 2601 } 2602 2603 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 2604 return AMDGPUInstPrinter::getRegisterName(Reg); 2605 } 2606 2607 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) { 2608 return getRegBitWidth(RC.getID()); 2609 } 2610 2611 static const TargetRegisterClass * 2612 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 2613 if (BitWidth == 64) 2614 return &AMDGPU::VReg_64RegClass; 2615 if (BitWidth == 96) 2616 return &AMDGPU::VReg_96RegClass; 2617 if (BitWidth == 128) 2618 return &AMDGPU::VReg_128RegClass; 2619 if (BitWidth == 160) 2620 return &AMDGPU::VReg_160RegClass; 2621 if (BitWidth == 192) 2622 return &AMDGPU::VReg_192RegClass; 2623 if (BitWidth == 224) 2624 return &AMDGPU::VReg_224RegClass; 2625 if (BitWidth == 256) 2626 return &AMDGPU::VReg_256RegClass; 2627 if (BitWidth == 288) 2628 return &AMDGPU::VReg_288RegClass; 2629 if (BitWidth == 320) 2630 return &AMDGPU::VReg_320RegClass; 2631 if (BitWidth == 352) 2632 return &AMDGPU::VReg_352RegClass; 2633 if (BitWidth == 384) 2634 return &AMDGPU::VReg_384RegClass; 2635 if (BitWidth == 512) 2636 return &AMDGPU::VReg_512RegClass; 2637 if (BitWidth == 1024) 2638 return &AMDGPU::VReg_1024RegClass; 2639 2640 return nullptr; 2641 } 2642 2643 static const TargetRegisterClass * 2644 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 2645 if (BitWidth == 64) 2646 return &AMDGPU::VReg_64_Align2RegClass; 2647 if (BitWidth == 96) 2648 return &AMDGPU::VReg_96_Align2RegClass; 2649 if (BitWidth == 128) 2650 return &AMDGPU::VReg_128_Align2RegClass; 2651 if (BitWidth == 160) 2652 return &AMDGPU::VReg_160_Align2RegClass; 2653 if (BitWidth == 192) 2654 return 
&AMDGPU::VReg_192_Align2RegClass; 2655 if (BitWidth == 224) 2656 return &AMDGPU::VReg_224_Align2RegClass; 2657 if (BitWidth == 256) 2658 return &AMDGPU::VReg_256_Align2RegClass; 2659 if (BitWidth == 288) 2660 return &AMDGPU::VReg_288_Align2RegClass; 2661 if (BitWidth == 320) 2662 return &AMDGPU::VReg_320_Align2RegClass; 2663 if (BitWidth == 352) 2664 return &AMDGPU::VReg_352_Align2RegClass; 2665 if (BitWidth == 384) 2666 return &AMDGPU::VReg_384_Align2RegClass; 2667 if (BitWidth == 512) 2668 return &AMDGPU::VReg_512_Align2RegClass; 2669 if (BitWidth == 1024) 2670 return &AMDGPU::VReg_1024_Align2RegClass; 2671 2672 return nullptr; 2673 } 2674 2675 const TargetRegisterClass * 2676 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { 2677 if (BitWidth == 1) 2678 return &AMDGPU::VReg_1RegClass; 2679 if (BitWidth == 16) 2680 return &AMDGPU::VGPR_16RegClass; 2681 if (BitWidth == 32) 2682 return &AMDGPU::VGPR_32RegClass; 2683 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth) 2684 : getAnyVGPRClassForBitWidth(BitWidth); 2685 } 2686 2687 static const TargetRegisterClass * 2688 getAnyAGPRClassForBitWidth(unsigned BitWidth) { 2689 if (BitWidth == 64) 2690 return &AMDGPU::AReg_64RegClass; 2691 if (BitWidth == 96) 2692 return &AMDGPU::AReg_96RegClass; 2693 if (BitWidth == 128) 2694 return &AMDGPU::AReg_128RegClass; 2695 if (BitWidth == 160) 2696 return &AMDGPU::AReg_160RegClass; 2697 if (BitWidth == 192) 2698 return &AMDGPU::AReg_192RegClass; 2699 if (BitWidth == 224) 2700 return &AMDGPU::AReg_224RegClass; 2701 if (BitWidth == 256) 2702 return &AMDGPU::AReg_256RegClass; 2703 if (BitWidth == 288) 2704 return &AMDGPU::AReg_288RegClass; 2705 if (BitWidth == 320) 2706 return &AMDGPU::AReg_320RegClass; 2707 if (BitWidth == 352) 2708 return &AMDGPU::AReg_352RegClass; 2709 if (BitWidth == 384) 2710 return &AMDGPU::AReg_384RegClass; 2711 if (BitWidth == 512) 2712 return &AMDGPU::AReg_512RegClass; 2713 if (BitWidth == 1024) 2714 return &AMDGPU::AReg_1024RegClass; 2715 2716 return nullptr; 2717 } 2718 2719 static const TargetRegisterClass * 2720 getAlignedAGPRClassForBitWidth(unsigned BitWidth) { 2721 if (BitWidth == 64) 2722 return &AMDGPU::AReg_64_Align2RegClass; 2723 if (BitWidth == 96) 2724 return &AMDGPU::AReg_96_Align2RegClass; 2725 if (BitWidth == 128) 2726 return &AMDGPU::AReg_128_Align2RegClass; 2727 if (BitWidth == 160) 2728 return &AMDGPU::AReg_160_Align2RegClass; 2729 if (BitWidth == 192) 2730 return &AMDGPU::AReg_192_Align2RegClass; 2731 if (BitWidth == 224) 2732 return &AMDGPU::AReg_224_Align2RegClass; 2733 if (BitWidth == 256) 2734 return &AMDGPU::AReg_256_Align2RegClass; 2735 if (BitWidth == 288) 2736 return &AMDGPU::AReg_288_Align2RegClass; 2737 if (BitWidth == 320) 2738 return &AMDGPU::AReg_320_Align2RegClass; 2739 if (BitWidth == 352) 2740 return &AMDGPU::AReg_352_Align2RegClass; 2741 if (BitWidth == 384) 2742 return &AMDGPU::AReg_384_Align2RegClass; 2743 if (BitWidth == 512) 2744 return &AMDGPU::AReg_512_Align2RegClass; 2745 if (BitWidth == 1024) 2746 return &AMDGPU::AReg_1024_Align2RegClass; 2747 2748 return nullptr; 2749 } 2750 2751 const TargetRegisterClass * 2752 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { 2753 if (BitWidth == 16) 2754 return &AMDGPU::AGPR_LO16RegClass; 2755 if (BitWidth == 32) 2756 return &AMDGPU::AGPR_32RegClass; 2757 return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) 2758 : getAnyAGPRClassForBitWidth(BitWidth); 2759 } 2760 2761 static const TargetRegisterClass * 2762 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { 2763 if (BitWidth == 64) 2764 return &AMDGPU::AV_64RegClass; 2765 if (BitWidth == 96) 2766 return &AMDGPU::AV_96RegClass; 2767 if (BitWidth == 128) 2768 return &AMDGPU::AV_128RegClass; 2769 if (BitWidth == 160) 2770 return &AMDGPU::AV_160RegClass; 2771 if (BitWidth == 192) 2772 return &AMDGPU::AV_192RegClass; 2773 if (BitWidth == 224) 2774 return &AMDGPU::AV_224RegClass; 2775 if (BitWidth == 256) 2776 return &AMDGPU::AV_256RegClass; 2777 if (BitWidth == 288) 2778 return &AMDGPU::AV_288RegClass; 2779 if (BitWidth == 320) 2780 return &AMDGPU::AV_320RegClass; 2781 if (BitWidth == 352) 2782 return &AMDGPU::AV_352RegClass; 2783 if (BitWidth == 384) 2784 return &AMDGPU::AV_384RegClass; 2785 if (BitWidth == 512) 2786 return &AMDGPU::AV_512RegClass; 2787 if (BitWidth == 1024) 2788 return &AMDGPU::AV_1024RegClass; 2789 2790 return nullptr; 2791 } 2792 2793 static const TargetRegisterClass * 2794 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { 2795 if (BitWidth == 64) 2796 return &AMDGPU::AV_64_Align2RegClass; 2797 if (BitWidth == 96) 2798 return &AMDGPU::AV_96_Align2RegClass; 2799 if (BitWidth == 128) 2800 return &AMDGPU::AV_128_Align2RegClass; 2801 if (BitWidth == 160) 2802 return &AMDGPU::AV_160_Align2RegClass; 2803 if (BitWidth == 192) 2804 return &AMDGPU::AV_192_Align2RegClass; 2805 if (BitWidth == 224) 2806 return &AMDGPU::AV_224_Align2RegClass; 2807 if (BitWidth == 256) 2808 return &AMDGPU::AV_256_Align2RegClass; 2809 if (BitWidth == 288) 2810 return &AMDGPU::AV_288_Align2RegClass; 2811 if (BitWidth == 320) 2812 return &AMDGPU::AV_320_Align2RegClass; 2813 if (BitWidth == 352) 2814 return &AMDGPU::AV_352_Align2RegClass; 2815 if (BitWidth == 384) 2816 return &AMDGPU::AV_384_Align2RegClass; 2817 if (BitWidth == 512) 2818 return &AMDGPU::AV_512_Align2RegClass; 2819 if (BitWidth == 1024) 2820 return &AMDGPU::AV_1024_Align2RegClass; 2821 2822 return nullptr; 2823 } 2824 2825 const TargetRegisterClass * 2826 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { 2827 if (BitWidth == 32) 2828 return &AMDGPU::AV_32RegClass; 2829 return ST.needsAlignedVGPRs() 2830 ? 
getAlignedVectorSuperClassForBitWidth(BitWidth) 2831 : getAnyVectorSuperClassForBitWidth(BitWidth); 2832 } 2833 2834 const TargetRegisterClass * 2835 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 2836 if (BitWidth == 16) 2837 return &AMDGPU::SGPR_LO16RegClass; 2838 if (BitWidth == 32) 2839 return &AMDGPU::SReg_32RegClass; 2840 if (BitWidth == 64) 2841 return &AMDGPU::SReg_64RegClass; 2842 if (BitWidth == 96) 2843 return &AMDGPU::SGPR_96RegClass; 2844 if (BitWidth == 128) 2845 return &AMDGPU::SGPR_128RegClass; 2846 if (BitWidth == 160) 2847 return &AMDGPU::SGPR_160RegClass; 2848 if (BitWidth == 192) 2849 return &AMDGPU::SGPR_192RegClass; 2850 if (BitWidth == 224) 2851 return &AMDGPU::SGPR_224RegClass; 2852 if (BitWidth == 256) 2853 return &AMDGPU::SGPR_256RegClass; 2854 if (BitWidth == 288) 2855 return &AMDGPU::SGPR_288RegClass; 2856 if (BitWidth == 320) 2857 return &AMDGPU::SGPR_320RegClass; 2858 if (BitWidth == 352) 2859 return &AMDGPU::SGPR_352RegClass; 2860 if (BitWidth == 384) 2861 return &AMDGPU::SGPR_384RegClass; 2862 if (BitWidth == 512) 2863 return &AMDGPU::SGPR_512RegClass; 2864 if (BitWidth == 1024) 2865 return &AMDGPU::SGPR_1024RegClass; 2866 2867 return nullptr; 2868 } 2869 2870 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 2871 Register Reg) const { 2872 const TargetRegisterClass *RC; 2873 if (Reg.isVirtual()) 2874 RC = MRI.getRegClass(Reg); 2875 else 2876 RC = getPhysRegBaseClass(Reg); 2877 return RC ? isSGPRClass(RC) : false; 2878 } 2879 2880 const TargetRegisterClass * 2881 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 2882 unsigned Size = getRegSizeInBits(*SRC); 2883 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 2884 assert(VRC && "Invalid register class size"); 2885 return VRC; 2886 } 2887 2888 const TargetRegisterClass * 2889 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 2890 unsigned Size = getRegSizeInBits(*SRC); 2891 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 2892 assert(ARC && "Invalid register class size"); 2893 return ARC; 2894 } 2895 2896 const TargetRegisterClass * 2897 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 2898 unsigned Size = getRegSizeInBits(*VRC); 2899 if (Size == 32) 2900 return &AMDGPU::SGPR_32RegClass; 2901 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 2902 assert(SRC && "Invalid register class size"); 2903 return SRC; 2904 } 2905 2906 const TargetRegisterClass * 2907 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 2908 const TargetRegisterClass *SubRC, 2909 unsigned SubIdx) const { 2910 // Ensure this subregister index is aligned in the super register. 2911 const TargetRegisterClass *MatchRC = 2912 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 2913 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? 
MatchRC : nullptr; 2914 } 2915 2916 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 2917 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 2918 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 2919 return !ST.hasMFMAInlineLiteralBug(); 2920 2921 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 2922 OpType <= AMDGPU::OPERAND_SRC_LAST; 2923 } 2924 2925 bool SIRegisterInfo::shouldRewriteCopySrc( 2926 const TargetRegisterClass *DefRC, 2927 unsigned DefSubReg, 2928 const TargetRegisterClass *SrcRC, 2929 unsigned SrcSubReg) const { 2930 // We want to prefer the smallest register class possible, so we don't want to 2931 // stop and rewrite on anything that looks like a subregister 2932 // extract. Operations mostly don't care about the super register class, so we 2933 // only want to stop on the most basic of copies between the same register 2934 // class. 2935 // 2936 // e.g. if we have something like 2937 // %0 = ... 2938 // %1 = ... 2939 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 2940 // %3 = COPY %2, sub0 2941 // 2942 // We want to look through the COPY to find: 2943 // => %3 = COPY %0 2944 2945 // Plain copy. 2946 return getCommonSubClass(DefRC, SrcRC) != nullptr; 2947 } 2948 2949 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { 2950 // TODO: 64-bit operands have extending behavior from 32-bit literal. 2951 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && 2952 OpType <= AMDGPU::OPERAND_REG_IMM_LAST; 2953 } 2954 2955 /// Returns a lowest register that is not used at any point in the function. 2956 /// If all registers are used, then this function will return 2957 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return 2958 /// highest unused register. 2959 MCRegister SIRegisterInfo::findUnusedRegister( 2960 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, 2961 const MachineFunction &MF, bool ReserveHighestRegister) const { 2962 if (ReserveHighestRegister) { 2963 for (MCRegister Reg : reverse(*RC)) 2964 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2965 return Reg; 2966 } else { 2967 for (MCRegister Reg : *RC) 2968 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2969 return Reg; 2970 } 2971 return MCRegister(); 2972 } 2973 2974 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI, 2975 const RegisterBankInfo &RBI, 2976 Register Reg) const { 2977 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo()); 2978 if (!RB) 2979 return false; 2980 2981 return !RBI.isDivergentRegBank(RB); 2982 } 2983 2984 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 2985 unsigned EltSize) const { 2986 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC); 2987 assert(RegBitWidth >= 32 && RegBitWidth <= 1024); 2988 2989 const unsigned RegDWORDs = RegBitWidth / 32; 2990 const unsigned EltDWORDs = EltSize / 4; 2991 assert(RegSplitParts.size() + 1 >= EltDWORDs); 2992 2993 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1]; 2994 const unsigned NumParts = RegDWORDs / EltDWORDs; 2995 2996 return ArrayRef(Parts.data(), NumParts); 2997 } 2998 2999 const TargetRegisterClass* 3000 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 3001 Register Reg) const { 3002 return Reg.isVirtual() ? 
MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg); 3003 } 3004 3005 const TargetRegisterClass * 3006 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI, 3007 const MachineOperand &MO) const { 3008 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg()); 3009 return getSubRegisterClass(SrcRC, MO.getSubReg()); 3010 } 3011 3012 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 3013 Register Reg) const { 3014 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 3015 // Registers without classes are unaddressable, SGPR-like registers. 3016 return RC && isVGPRClass(RC); 3017 } 3018 3019 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 3020 Register Reg) const { 3021 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 3022 3023 // Registers without classes are unaddressable, SGPR-like registers. 3024 return RC && isAGPRClass(RC); 3025 } 3026 3027 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 3028 const TargetRegisterClass *SrcRC, 3029 unsigned SubReg, 3030 const TargetRegisterClass *DstRC, 3031 unsigned DstSubReg, 3032 const TargetRegisterClass *NewRC, 3033 LiveIntervals &LIS) const { 3034 unsigned SrcSize = getRegSizeInBits(*SrcRC); 3035 unsigned DstSize = getRegSizeInBits(*DstRC); 3036 unsigned NewSize = getRegSizeInBits(*NewRC); 3037 3038 // Do not increase size of registers beyond dword, we would need to allocate 3039 // adjacent registers and constraint regalloc more than needed. 3040 3041 // Always allow dword coalescing. 3042 if (SrcSize <= 32 || DstSize <= 32) 3043 return true; 3044 3045 return NewSize <= DstSize || NewSize <= SrcSize; 3046 } 3047 3048 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 3049 MachineFunction &MF) const { 3050 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3051 3052 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 3053 MF.getFunction()); 3054 switch (RC->getID()) { 3055 default: 3056 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 3057 case AMDGPU::VGPR_32RegClassID: 3058 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 3059 case AMDGPU::SGPR_32RegClassID: 3060 case AMDGPU::SGPR_LO16RegClassID: 3061 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 3062 } 3063 } 3064 3065 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 3066 unsigned Idx) const { 3067 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 3068 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 3069 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 3070 const_cast<MachineFunction &>(MF)); 3071 3072 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 3073 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 3074 const_cast<MachineFunction &>(MF)); 3075 3076 llvm_unreachable("Unexpected register pressure set!"); 3077 } 3078 3079 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 3080 static const int Empty[] = { -1 }; 3081 3082 if (RegPressureIgnoredUnits[RegUnit]) 3083 return Empty; 3084 3085 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 3086 } 3087 3088 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 3089 // Not a callee saved register. 
3090 return AMDGPU::SGPR30_SGPR31; 3091 } 3092 3093 const TargetRegisterClass * 3094 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 3095 const RegisterBank &RB) const { 3096 switch (RB.getID()) { 3097 case AMDGPU::VGPRRegBankID: 3098 return getVGPRClassForBitWidth( 3099 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size)); 3100 case AMDGPU::VCCRegBankID: 3101 assert(Size == 1); 3102 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3103 : &AMDGPU::SReg_64_XEXECRegClass; 3104 case AMDGPU::SGPRRegBankID: 3105 return getSGPRClassForBitWidth(std::max(32u, Size)); 3106 case AMDGPU::AGPRRegBankID: 3107 return getAGPRClassForBitWidth(std::max(32u, Size)); 3108 default: 3109 llvm_unreachable("unknown register bank"); 3110 } 3111 } 3112 3113 const TargetRegisterClass * 3114 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 3115 const MachineRegisterInfo &MRI) const { 3116 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 3117 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 3118 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); 3119 3120 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) 3121 return getAllocatableClass(RC); 3122 3123 return nullptr; 3124 } 3125 3126 MCRegister SIRegisterInfo::getVCC() const { 3127 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 3128 } 3129 3130 MCRegister SIRegisterInfo::getExec() const { 3131 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 3132 } 3133 3134 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 3135 // VGPR tuples have an alignment requirement on gfx90a variants. 3136 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass 3137 : &AMDGPU::VReg_64RegClass; 3138 } 3139 3140 const TargetRegisterClass * 3141 SIRegisterInfo::getRegClass(unsigned RCID) const { 3142 switch ((int)RCID) { 3143 case AMDGPU::SReg_1RegClassID: 3144 return getBoolRC(); 3145 case AMDGPU::SReg_1_XEXECRegClassID: 3146 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3147 : &AMDGPU::SReg_64_XEXECRegClass; 3148 case -1: 3149 return nullptr; 3150 default: 3151 return AMDGPUGenRegisterInfo::getRegClass(RCID); 3152 } 3153 } 3154 3155 // Find reaching register definition 3156 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 3157 MachineInstr &Use, 3158 MachineRegisterInfo &MRI, 3159 LiveIntervals *LIS) const { 3160 auto &MDT = LIS->getDomTree(); 3161 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 3162 SlotIndex DefIdx; 3163 3164 if (Reg.isVirtual()) { 3165 if (!LIS->hasInterval(Reg)) 3166 return nullptr; 3167 LiveInterval &LI = LIS->getInterval(Reg); 3168 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 3169 : MRI.getMaxLaneMaskForVReg(Reg); 3170 VNInfo *V = nullptr; 3171 if (LI.hasSubRanges()) { 3172 for (auto &S : LI.subranges()) { 3173 if ((S.LaneMask & SubLanes) == SubLanes) { 3174 V = S.getVNInfoAt(UseIdx); 3175 break; 3176 } 3177 } 3178 } else { 3179 V = LI.getVNInfoAt(UseIdx); 3180 } 3181 if (!V) 3182 return nullptr; 3183 DefIdx = V->def; 3184 } else { 3185 // Find last def. 
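    // Editorial note on the physical-register case below: the reaching
    // definition is reconstructed from the live ranges of the register's
    // units. Every unit must have a value live at UseIdx; among those values
    // the loop keeps the latest definition, i.e. the one dominated by the
    // previously selected one. If any unit has no live value at the use,
    // there is no single reaching definition and the function returns
    // nullptr.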
3186 for (MCRegUnit Unit : regunits(Reg.asMCReg())) { 3187 LiveRange &LR = LIS->getRegUnit(Unit); 3188 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 3189 if (!DefIdx.isValid() || 3190 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 3191 LIS->getInstructionFromIndex(V->def))) 3192 DefIdx = V->def; 3193 } else { 3194 return nullptr; 3195 } 3196 } 3197 } 3198 3199 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 3200 3201 if (!Def || !MDT.dominates(Def, &Use)) 3202 return nullptr; 3203 3204 assert(Def->modifiesRegister(Reg, this)); 3205 3206 return Def; 3207 } 3208 3209 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 3210 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32); 3211 3212 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 3213 AMDGPU::SReg_32RegClass, 3214 AMDGPU::AGPR_32RegClass } ) { 3215 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 3216 return Super; 3217 } 3218 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 3219 &AMDGPU::VGPR_32RegClass)) { 3220 return Super; 3221 } 3222 3223 return AMDGPU::NoRegister; 3224 } 3225 3226 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 3227 if (!ST.needsAlignedVGPRs()) 3228 return true; 3229 3230 if (isVGPRClass(&RC)) 3231 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 3232 if (isAGPRClass(&RC)) 3233 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 3234 if (isVectorSuperClass(&RC)) 3235 return RC.hasSuperClassEq( 3236 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 3237 3238 return true; 3239 } 3240 3241 const TargetRegisterClass * 3242 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { 3243 if (!RC || !ST.needsAlignedVGPRs()) 3244 return RC; 3245 3246 unsigned Size = getRegSizeInBits(*RC); 3247 if (Size <= 32) 3248 return RC; 3249 3250 if (isVGPRClass(RC)) 3251 return getAlignedVGPRClassForBitWidth(Size); 3252 if (isAGPRClass(RC)) 3253 return getAlignedAGPRClassForBitWidth(Size); 3254 if (isVectorSuperClass(RC)) 3255 return getAlignedVectorSuperClassForBitWidth(Size); 3256 3257 return RC; 3258 } 3259 3260 ArrayRef<MCPhysReg> 3261 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 3262 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); 3263 } 3264 3265 ArrayRef<MCPhysReg> 3266 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 3267 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2); 3268 } 3269 3270 ArrayRef<MCPhysReg> 3271 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 3272 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 3273 } 3274 3275 unsigned 3276 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, 3277 unsigned SubReg) const { 3278 switch (RC->TSFlags & SIRCFlags::RegKindMask) { 3279 case SIRCFlags::HasSGPR: 3280 return std::min(128u, getSubRegIdxSize(SubReg)); 3281 case SIRCFlags::HasAGPR: 3282 case SIRCFlags::HasVGPR: 3283 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: 3284 return std::min(32u, getSubRegIdxSize(SubReg)); 3285 default: 3286 break; 3287 } 3288 return 0; 3289 } 3290
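// Editorial footnote (not upstream documentation): getSubRegAlignmentNumBits()
// above returns getSubRegIdxSize(SubReg) capped at 128 bits for SGPR register
// classes and at 32 bits for VGPR/AGPR register classes, and 0 for anything
// else. For example, a 256-bit sub-tuple yields 128 on an SGPR class but 32 on
// a VGPR class, while a 16-bit subregister yields 16 in either case.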