//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted by 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
                                 const Twine &ErrMsg) {
  Fn.getContext().diagnose(
      DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
}

namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
// just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save needed, all or inactive lanes of a TmpVGPR
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified.
// If we cannot scavenge temporary SGPRs to save exec, we use the following
// code:
//   buffer_store_dword TmpVGPR ; only if active lanes need to be saved
//   s_not exec, exec
//   buffer_store_dword TmpVGPR ; save inactive lanes
//   s_not exec, exec
struct SGPRSpillBuilder {
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // Whether TmpVGPR is live before the spill (i.e. no register that is dead in
  // the currently active lanes could be scavenged).
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock *MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }

  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }

  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  //   s_mov_b64 s[6:7], exec  ; Save exec
  //   s_mov_b64 exec, 3       ; Wanted lanemask
  //   buffer_store_dword v1   ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  //   buffer_store_dword v0   ; Only if no free VGPR was found
  //   s_not_b64 exec, exec
  //   buffer_store_dword v0   ; Save inactive lanes
  //                           ; exec stays inverted, it is flipped back in
  //                           ; restore.
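  //
  // Illustrative example (register names assumed for exposition): spilling an
  // 8-dword SGPR tuple such as s[4:11] in wave64 gives getPerVGPRData() =
  // {PerVGPR = 64, NumVGPRs = 1, VGPRLanes = 0xFF}, so the "wanted lanemask"
  // written to exec above would be 0xFF and a single TmpVGPR is enough.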
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
                                            0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes, we only
      // need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    if (TmpVGPRLive) {
      // We need to inform the scavenger that this index is already in use until
      // we're done with the custom emergency spill.
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
    }

    // We may end up recursively calling the scavenger, and don't want to re-use
    // the same register.
    RS->setRegUsed(TmpVGPR);

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // Modifying and restoring exec clobbers SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        emitUnsupportedError(MF.getFunction(), *MI,
                             "unhandled SGPR spill to memory");

      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
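      // At this point exec is inverted, so the store below saves the lanes that
      // were inactive at the spill point; exec is left inverted and is flipped
      // back in restore() (see the comment above prepare()).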
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }

  // Writes these instructions if an SGPR can be scavenged:
  //   buffer_load_dword v1    ; Restore scavenged VGPR from emergency slot
  //   s_waitcnt vmcnt(0)      ; If a free VGPR was found
  //   s_mov_b64 exec, s[6:7]  ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  //   buffer_load_dword v0    ; Restore inactive lanes
  //   s_waitcnt vmcnt(0)      ; If a free VGPR was found
  //   s_not_b64 exec, exec
  //   buffer_load_dword v0    ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME: This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.

      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }

    // Inform the scavenger where we're releasing our custom scavenged register.
    if (TmpVGPRLive) {
      MachineBasicBlock::iterator RestorePt = std::prev(MI);
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
    }
  }

  // Write TmpVGPR to memory or read TmpVGPR from memory.
  // Either using a single buffer_load/store if exec is set to the needed mask
  // or using
  //   buffer_load
  //   s_not exec, exec
  //   buffer_load
  //   s_not exec, exec
  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
    if (SavedExecReg) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    } else {
      // Modifying and restoring exec clobbers SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        emitUnsupportedError(MF.getFunction(), *MI,
                             "unhandled SGPR spill to memory");

      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
                                  /*IsKill*/ false);
      // Spill inactive lanes
      auto Not0 =
          BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
      auto Not1 =
          BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
    }
  }

  void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
    assert(MBB->getParent() == &MF);
    MI = NewMI;
    MBB = NewMBB;
  }
};

} // namespace llvm

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
                            ST.getAMDGPUDwarfFlavour(),
                            /*PC=*/0, ST.getHwMode()),
      ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
  for (auto Reg : AMDGPU::VGPR_16RegClass) {
    if (AMDGPU::isHi16Reg(Reg, *this))
      RegPressureIgnoredUnits.set(*regunits(Reg).begin());
  }

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      if (Size & 31)
        continue;
      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      if (Pos % Size)
        continue;
      Pos /= Size;
      if (Vec.empty()) {
        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
        Vec.resize(MaxNumParts);
      }
      Vec[Pos] = Idx;
    }
  };

  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = getSubRegIdxSize(Idx) / 32;
      unsigned Offset = getSubRegIdxOffset(Idx) / 32;
      assert(Width < SubRegFromChannelTableWidthMap.size());
      Width = SubRegFromChannelTableWidthMap[Width];
      if (Width == 0)
        continue;
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;
    }
  };

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
  for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
    Reserved.set(*R);
}

// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SaveList;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SI_Gfx_SaveList;
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CSR_AMDGPU_CS_ChainPreserve_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}

const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
  return nullptr;
}

const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
                               : CSR_AMDGPU_RegMask;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                               : CSR_AMDGPU_SI_Gfx_RegMask;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    // Calls to these functions never return, so we can pretend everything is
    // preserved.
    return AMDGPU_AllVGPRs_RegMask;
  default:
    return nullptr;
  }
}

const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
}

bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
  return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
}

const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
                                          const MachineFunction &MF) const {
  // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
  // equivalent AV class. If one were used here, the verifier would crash after
  // RegBankSelect in the GISel flow, because the aligned regclasses are not
  // fully set up until instruction selection.
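  // For example, a value constrained to VReg_64 or AReg_64 below is widened to
  // AV_64, which lets the allocator choose either a VGPR pair or an AGPR pair.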
  if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
    if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
      return &AMDGPU::AV_32RegClass;
    if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
      return &AMDGPU::AV_64RegClass;
    if (RC == &AMDGPU::VReg_64_Align2RegClass ||
        RC == &AMDGPU::AReg_64_Align2RegClass)
      return &AMDGPU::AV_64_Align2RegClass;
    if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
      return &AMDGPU::AV_96RegClass;
    if (RC == &AMDGPU::VReg_96_Align2RegClass ||
        RC == &AMDGPU::AReg_96_Align2RegClass)
      return &AMDGPU::AV_96_Align2RegClass;
    if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
      return &AMDGPU::AV_128RegClass;
    if (RC == &AMDGPU::VReg_128_Align2RegClass ||
        RC == &AMDGPU::AReg_128_Align2RegClass)
      return &AMDGPU::AV_128_Align2RegClass;
    if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
      return &AMDGPU::AV_160RegClass;
    if (RC == &AMDGPU::VReg_160_Align2RegClass ||
        RC == &AMDGPU::AReg_160_Align2RegClass)
      return &AMDGPU::AV_160_Align2RegClass;
    if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
      return &AMDGPU::AV_192RegClass;
    if (RC == &AMDGPU::VReg_192_Align2RegClass ||
        RC == &AMDGPU::AReg_192_Align2RegClass)
      return &AMDGPU::AV_192_Align2RegClass;
    if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
      return &AMDGPU::AV_256RegClass;
    if (RC == &AMDGPU::VReg_256_Align2RegClass ||
        RC == &AMDGPU::AReg_256_Align2RegClass)
      return &AMDGPU::AV_256_Align2RegClass;
    if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
      return &AMDGPU::AV_512RegClass;
    if (RC == &AMDGPU::VReg_512_Align2RegClass ||
        RC == &AMDGPU::AReg_512_Align2RegClass)
      return &AMDGPU::AV_512_Align2RegClass;
    if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
      return &AMDGPU::AV_1024RegClass;
    if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
        RC == &AMDGPU::AReg_1024_Align2RegClass)
      return &AMDGPU::AV_1024_Align2RegClass;
  }

  return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
}

Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI = ST.getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry and chain
  // functions, but never actually want to reference it when accessing our own
  // frame. If we need a frame pointer we use it, but otherwise we can just use
  // an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isBottomOfStack()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}

bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
  return shouldRealignStack(MF);
}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return AMDGPU_AllAllocatableSRegs_RegMask;
}

unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}

MCRegister
SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
                                        const unsigned Align,
                                        const TargetRegisterClass *RC) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
}

MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
  return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}

std::pair<unsigned, unsigned>
SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: It shall be possible to estimate maximum AGPR/VGPR pressure and split
  // the register file accordingly.
  if (ST.hasGFX90AInsts()) {
    if (MFI->usesAGPRs(MF)) {
      MaxNumVGPRs /= 2;
      MaxNumAGPRs = MaxNumVGPRs;
    } else {
      if (MaxNumVGPRs > TotalNumVGPRs) {
        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
        MaxNumVGPRs = TotalNumVGPRs;
      } else
        MaxNumAGPRs = 0;
    }
  }

  return std::pair(MaxNumVGPRs, MaxNumAGPRs);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Reserve special purpose registers.
  //
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated.
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);

  // Reserve SGPRs.
  //
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (const TargetRegisterClass *RC : regclasses()) {
    if (RC->isBaseClass() && isSGPRClass(RC)) {
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
      for (MCPhysReg Reg : *RC) {
        unsigned Index = getHWRegIndex(Reg);
        if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
          Reserved.set(Reg);
      }
    }
  }

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
  if (LongBranchReservedReg)
    reserveRegisterTuples(Reserved, LongBranchReservedReg);

  // We have to assume the SP is needed in case there are calls in the function,
  // which is detected after the function is lowered. If we aren't really going
  // to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  // FIXME: Use same reserved register introduced in D149775
  // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
  Register ExecCopyReg = MFI->getSGPRForEXECCopy();
  if (ExecCopyReg)
    reserveRegisterTuples(Reserved, ExecCopyReg);

  // Reserve VGPRs/AGPRs.
  //
  auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF);

  for (const TargetRegisterClass *RC : regclasses()) {
    if (RC->isBaseClass() && isVGPRClass(RC)) {
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
      for (MCPhysReg Reg : *RC) {
        unsigned Index = getHWRegIndex(Reg);
        if (Index + NumRegs > MaxNumVGPRs)
          Reserved.set(Reg);
      }
    }
  }

  // Reserve all the AGPRs if there are no instructions to use them.
  if (!ST.hasMAIInsts())
    MaxNumAGPRs = 0;
  for (const TargetRegisterClass *RC : regclasses()) {
    if (RC->isBaseClass() && isAGPRClass(RC)) {
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
      for (MCPhysReg Reg : *RC) {
        unsigned Index = getHWRegIndex(Reg);
        if (Index + NumRegs > MaxNumAGPRs)
          Reserved.set(Reg);
      }
    }
  }

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
  }

  // During wwm-regalloc, reserve the registers for per-lane VGPR allocation.
  // The MFI->getNonWWMRegMask() field will have a valid bitmask only during
  // wwm-regalloc and is empty otherwise.
  BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
  if (!NonWWMRegMask.empty()) {
    for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
         RegI < RegE; ++RegI) {
      if (NonWWMRegMask.test(RegI))
        reserveRegisterTuples(Reserved, RegI);
    }
  }

  for (Register Reg : MFI->getWWMReservedRegs())
    reserveRegisterTuples(Reserved, Reg);

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}

bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
                                      MCRegister PhysReg) const {
  return !MF.getRegInfo().isReserved(PhysReg);
}

bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry or in chain functions, the base address is 0, so it can't possibly
  // need any more alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isBottomOfStack())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  switch (MI->getOpcode()) {
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e32: {
    int OtherIdx = Idx == 1 ? 2 : 1;
    const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
    return OtherOp.isImm() ? OtherOp.getImm() : 0;
  }
  case AMDGPU::V_ADD_CO_U32_e64: {
    int OtherIdx = Idx == 2 ? 3 : 2;
    const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
    return OtherOp.isImm() ? OtherOp.getImm() : 0;
  }
  default:
    break;
  }

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}

static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
                              const MachineInstr &MI) {
  assert(MI.getDesc().isAdd());
  const MachineOperand &Src0 = MI.getOperand(1);
  const MachineOperand &Src1 = MI.getOperand(2);

  if (Src0.isFI()) {
    return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
                                                       Src1.getReg()));
  }

  if (Src1.isFI()) {
    return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
                                                       Src0.getReg()));
  }

  return false;
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
  switch (MI->getOpcode()) {
  case AMDGPU::V_ADD_U32_e32: {
    // TODO: We could handle this but it requires work to avoid violating
    // operand restrictions.
    if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
        !isFIPlusImmOrVGPR(*this, *MI))
      return false;
    [[fallthrough]];
  }
  case AMDGPU::V_ADD_U32_e64:
    // FIXME: This optimization is barely profitable with enableFlatScratch
    // as-is.
    //
    // Much of the benefit with the MUBUF handling is we avoid duplicating the
    // shift of the frame register, which isn't needed with scratch.
    //
    // materializeFrameBaseRegister doesn't know the register classes of the
    // uses, and unconditionally uses an s_add_i32, which will end up using a
    // copy for the vector uses.
    return !ST.enableFlatScratch();
  case AMDGPU::V_ADD_CO_U32_e32:
    if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
        !isFIPlusImmOrVGPR(*this, *MI))
      return false;
    // We can't deal with the case where the carry out has a use (though this
    // should never happen).
    return MI->getOperand(3).isDead();
  case AMDGPU::V_ADD_CO_U32_e64:
    // TODO: Should we check use_empty instead?
    return MI->getOperand(1).isDead();
  default:
    break;
  }

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return !TII->isLegalMUBUFImmOffset(FullOffset);

  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}

Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    // FIXME: Make sure scc isn't live in.
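    // Illustrative sequence emitted on this path (flat scratch), with the frame
    // index still to be resolved later:
    //   s_mov_b32 <OffsetReg>, Offset
    //   s_mov_b32 <FIReg>, FrameIdx
    //   s_add_i32 <BaseReg>, <OffsetReg>, <FIReg>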
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg)
        .setOperandDead(3); // scc
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg)
      .addImm(0); // clamp bit

  return BaseReg;
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();

  switch (MI.getOpcode()) {
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32: {
    MachineOperand *FIOp = &MI.getOperand(2);
    MachineOperand *ImmOp = &MI.getOperand(1);
    if (!FIOp->isFI())
      std::swap(FIOp, ImmOp);

    if (!ImmOp->isImm()) {
      assert(Offset == 0);
      FIOp->ChangeToRegister(BaseReg, false);
      TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
      return;
    }

    int64_t TotalOffset = ImmOp->getImm() + Offset;
    if (TotalOffset == 0) {
      MI.setDesc(TII->get(AMDGPU::COPY));
      for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
        MI.removeOperand(I);

      MI.getOperand(1).ChangeToRegister(BaseReg, false);
      return;
    }

    ImmOp->setImm(TotalOffset);

    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    MachineRegisterInfo &MRI = MF->getRegInfo();

    // FIXME: materializeFrameBaseRegister does not know the register class of
    // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
    // a copy so we have a legal operand and hope the register coalescer can
    // clean it up.
    if (isSGPRReg(MRI, BaseReg)) {
      Register BaseRegVGPR =
          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
          .addReg(BaseReg);
      MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
    } else {
      MI.getOperand(2).ChangeToRegister(BaseReg, false);
    }
    return;
  }
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64: {
    int Src0Idx = MI.getNumExplicitDefs();
    MachineOperand *FIOp = &MI.getOperand(Src0Idx);
    MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
    if (!FIOp->isFI())
      std::swap(FIOp, ImmOp);

    if (!ImmOp->isImm()) {
      FIOp->ChangeToRegister(BaseReg, false);
      TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
      return;
    }

    int64_t TotalOffset = ImmOp->getImm() + Offset;
    if (TotalOffset == 0) {
      MI.setDesc(TII->get(AMDGPU::COPY));

      for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
        MI.removeOperand(I);

      MI.getOperand(1).ChangeToRegister(BaseReg, false);
    } else {
      FIOp->ChangeToRegister(BaseReg, false);
      ImmOp->setImm(TotalOffset);
    }

    return;
  }
  default:
    break;
  }

  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {

  switch (MI->getOpcode()) {
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32:
    return true;
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64:
    return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
  default:
    break;
  }

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return TII->isLegalMUBUFImmOffset(NewOffset);

  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
  if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
    return getEquivalentVGPRClass(RC);
  if (RC == &AMDGPU::SCC_CLASSRegClass)
    return getWaveMaskRegClass();

  return RC;
}

static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_V384_SAVE:
  case AMDGPU::SI_SPILL_V384_RESTORE:
  case AMDGPU::SI_SPILL_A384_SAVE:
  case AMDGPU::SI_SPILL_A384_RESTORE:
  case AMDGPU::SI_SPILL_AV384_SAVE:
  case AMDGPU::SI_SPILL_AV384_RESTORE:
    return 12;
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_V352_SAVE:
  case AMDGPU::SI_SPILL_V352_RESTORE:
  case AMDGPU::SI_SPILL_A352_SAVE:
  case AMDGPU::SI_SPILL_A352_RESTORE:
  case AMDGPU::SI_SPILL_AV352_SAVE:
  case AMDGPU::SI_SPILL_AV352_RESTORE:
    return 11;
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_V320_SAVE:
  case AMDGPU::SI_SPILL_V320_RESTORE:
  case AMDGPU::SI_SPILL_A320_SAVE:
  case AMDGPU::SI_SPILL_A320_RESTORE:
  case AMDGPU::SI_SPILL_AV320_SAVE:
  case AMDGPU::SI_SPILL_AV320_RESTORE:
    return 10;
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_V288_SAVE:
  case AMDGPU::SI_SPILL_V288_RESTORE:
  case AMDGPU::SI_SPILL_A288_SAVE:
  case AMDGPU::SI_SPILL_A288_RESTORE:
  case AMDGPU::SI_SPILL_AV288_SAVE:
  case AMDGPU::SI_SPILL_AV288_RESTORE:
    return 9;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_V32_SAVE:
  case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
  case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffenMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static int getOffenMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  bool IsVGPR = TRI->isVGPR(MRI, Reg);
  DebugLoc DL = MI->getDebugLoc();
  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
    // The spiller may, during regalloc, restore a spilled register to its
    // superclass. That can result in AGPR spills restored to VGPRs or the other
    // way around, leaving src and dst with identical regclasses at this point;
    // a plain copy is all that is needed in such cases.
    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
                       .addReg(Src, getKillRegState(IsKill));
    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    return CopyMIB;
  }
  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
  bool UseST =
      !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  if (HasVAddr)
    LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
  else if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}

void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LiveRegUnits *LiveUnits) const {
  assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool CanClobberSCC = false;
  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t MaterializedOffset = Offset;

  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;

  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  // Track a VGPR to use for a constant offset we need to materialize.
  Register TmpOffsetVGPR;

  // Track a VGPR to use as an intermediate value.
  Register TmpIntermediateVGPR;
  bool UseVGPROffset = false;

  // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
  // combination.
  auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
                                int64_t VOffset) {
    // We are using a VGPR offset
    if (IsFlat && SGPRBase) {
      // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
      // SGPR, so perform the add as vector.
      // We don't need a base SGPR in the kernel.
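      // Sketch of the two forms emitted below: a single
      //   v_add_u32_e64 TmpVGPR, SGPRBase, VOffset
      // when the target can encode both an SGPR and a literal in one VALU add,
      // otherwise a v_mov_b32 of SGPRBase followed by a v_add_u32_e32 of the
      // literal offset.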
1586
1587       if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1588         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1589             .addReg(SGPRBase)
1590             .addImm(VOffset)
1591             .addImm(0); // clamp
1592       } else {
1593         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1594             .addReg(SGPRBase);
1595         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1596             .addImm(VOffset)
1597             .addReg(TmpOffsetVGPR);
1598       }
1599     } else {
1600       assert(TmpOffsetVGPR);
1601       BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1602           .addImm(VOffset);
1603     }
1604   };
1605
1606   bool IsOffsetLegal =
1607       IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1608                                       SIInstrFlags::FlatScratch)
1609              : TII->isLegalMUBUFImmOffset(MaxOffset);
1610   if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1611     SOffset = MCRegister();
1612
1613     // We don't have access to the register scavenger if this function is called
1614     // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1615     // TODO: Clobbering SCC is not necessary for scratch instructions in the
1616     // entry.
1617     if (RS) {
1618       SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1619
1620       // Piggy back on the liveness scan we just did to see if SCC is dead.
1621       CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1622     } else if (LiveUnits) {
1623       CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1624       for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1625         if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1626           SOffset = Reg;
1627           break;
1628         }
1629       }
1630     }
1631
1632     if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1633       SOffset = Register();
1634
1635     if (!SOffset) {
1636       UseVGPROffset = true;
1637
1638       if (RS) {
1639         TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1640       } else {
1641         assert(LiveUnits);
1642         for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1643           if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1644             TmpOffsetVGPR = Reg;
1645             break;
1646           }
1647         }
1648       }
1649
1650       assert(TmpOffsetVGPR);
1651     } else if (!SOffset && CanClobberSCC) {
1652       // There are no free SGPRs, and we are already in the process of
1653       // spilling VGPRs. Since we need a VGPR in order to spill SGPRs (this is
1654       // true on SI/CI, and on VI it is true until we implement spilling using
1655       // scalar stores), we have no way to free up an SGPR. Our solution here
1656       // is to add the offset directly to the ScratchOffset or StackPtrOffset
1657       // register, and then subtract the offset after the spill to return the
1658       // register to its original value.
1659
1660       // TODO: If we don't have to do an emergency stack slot spill, converting
1661       // to use the VGPR offset is fewer instructions.
1662       if (!ScratchOffsetReg)
1663         ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1664       SOffset = ScratchOffsetReg;
1665       ScratchOffsetRegDelta = Offset;
1666     } else {
1667       Scavenged = true;
1668     }
1669
1670     // We currently only support spilling VGPRs to EltSize boundaries, meaning
1671     // we can simplify the adjustment of Offset here to just scale with
1672     // WavefrontSize.
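    // For example, on a wave64 target an unswizzled per-lane offset of 16
    // bytes becomes a buffer offset of 16 * 64 = 1024 bytes.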
1673 if (!IsFlat && !UseVGPROffset) 1674 Offset *= ST.getWavefrontSize(); 1675 1676 if (!UseVGPROffset && !SOffset) 1677 report_fatal_error("could not scavenge SGPR to spill in entry function"); 1678 1679 if (UseVGPROffset) { 1680 // We are using a VGPR offset 1681 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset); 1682 } else if (ScratchOffsetReg == AMDGPU::NoRegister) { 1683 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); 1684 } else { 1685 assert(Offset != 0); 1686 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1687 .addReg(ScratchOffsetReg) 1688 .addImm(Offset); 1689 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1690 } 1691 1692 Offset = 0; 1693 } 1694 1695 if (IsFlat && SOffset == AMDGPU::NoRegister) { 1696 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 1697 && "Unexpected vaddr for flat scratch with a FI operand"); 1698 1699 if (UseVGPROffset) { 1700 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); 1701 } else { 1702 assert(ST.hasFlatScratchSTMode()); 1703 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1704 } 1705 1706 Desc = &TII->get(LoadStoreOp); 1707 } 1708 1709 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; 1710 ++i, RegOffset += EltSize) { 1711 if (i == NumSubRegs) { 1712 EltSize = RemSize; 1713 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1714 } 1715 Desc = &TII->get(LoadStoreOp); 1716 1717 if (!IsFlat && UseVGPROffset) { 1718 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp) 1719 : getOffenMUBUFLoad(LoadStoreOp); 1720 Desc = &TII->get(NewLoadStoreOp); 1721 } 1722 1723 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) { 1724 // If we are spilling an AGPR beyond the range of the memory instruction 1725 // offset and need to use a VGPR offset, we ideally have at least 2 1726 // scratch VGPRs. If we don't have a second free VGPR without spilling, 1727 // recycle the VGPR used for the offset which requires resetting after 1728 // each subregister. 1729 1730 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset); 1731 } 1732 1733 unsigned NumRegs = EltSize / 4; 1734 Register SubReg = e == 1 1735 ? ValueReg 1736 : Register(getSubReg(ValueReg, 1737 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1738 1739 unsigned SOffsetRegState = 0; 1740 unsigned SrcDstRegState = getDefRegState(!IsStore); 1741 const bool IsLastSubReg = i + 1 == e; 1742 const bool IsFirstSubReg = i == 0; 1743 if (IsLastSubReg) { 1744 SOffsetRegState |= getKillRegState(Scavenged); 1745 // The last implicit use carries the "Kill" flag. 1746 SrcDstRegState |= getKillRegState(IsKill); 1747 } 1748 1749 // Make sure the whole register is defined if there are undef components by 1750 // adding an implicit def of the super-reg on the first instruction. 1751 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg; 1752 bool NeedSuperRegImpOperand = e > 1; 1753 1754 // Remaining element size to spill into memory after some parts of it 1755 // spilled into either AGPRs or VGPRs. 1756 unsigned RemEltSize = EltSize; 1757 1758 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order, 1759 // starting from the last lane. In case if a register cannot be completely 1760 // spilled into another register that will ensure its alignment does not 1761 // change. 
For targets with VGPR alignment requirement this is important 1762 // in case of flat scratch usage as we might get a scratch_load or 1763 // scratch_store of an unaligned register otherwise. 1764 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS, 1765 LaneE = RegOffset / 4; 1766 Lane >= LaneE; --Lane) { 1767 bool IsSubReg = e > 1 || EltSize > 4; 1768 Register Sub = IsSubReg 1769 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) 1770 : ValueReg; 1771 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill); 1772 if (!MIB.getInstr()) 1773 break; 1774 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) { 1775 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1776 NeedSuperRegDef = false; 1777 } 1778 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) { 1779 NeedSuperRegImpOperand = true; 1780 unsigned State = SrcDstRegState; 1781 if (!IsLastSubReg || (Lane != LaneE)) 1782 State &= ~RegState::Kill; 1783 if (!IsFirstSubReg || (Lane != LaneS)) 1784 State &= ~RegState::Define; 1785 MIB.addReg(ValueReg, RegState::Implicit | State); 1786 } 1787 RemEltSize -= 4; 1788 } 1789 1790 if (!RemEltSize) // Fully spilled into AGPRs. 1791 continue; 1792 1793 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 1794 assert(IsFlat && EltSize > 4); 1795 1796 unsigned NumRegs = RemEltSize / 4; 1797 SubReg = Register(getSubReg(ValueReg, 1798 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1799 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 1800 Desc = &TII->get(Opc); 1801 } 1802 1803 unsigned FinalReg = SubReg; 1804 1805 if (IsAGPR) { 1806 assert(EltSize == 4); 1807 1808 if (!TmpIntermediateVGPR) { 1809 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy(); 1810 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR)); 1811 } 1812 if (IsStore) { 1813 auto AccRead = BuildMI(MBB, MI, DL, 1814 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), 1815 TmpIntermediateVGPR) 1816 .addReg(SubReg, getKillRegState(IsKill)); 1817 if (NeedSuperRegDef) 1818 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 1819 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) 1820 AccRead.addReg(ValueReg, RegState::Implicit); 1821 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1822 } 1823 SubReg = TmpIntermediateVGPR; 1824 } else if (UseVGPROffset) { 1825 if (!TmpOffsetVGPR) { 1826 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 1827 MI, false, 0); 1828 RS->setRegUsed(TmpOffsetVGPR); 1829 } 1830 } 1831 1832 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); 1833 MachineMemOperand *NewMMO = 1834 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 1835 commonAlignment(Alignment, RegOffset)); 1836 1837 auto MIB = 1838 BuildMI(MBB, MI, DL, *Desc) 1839 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); 1840 1841 if (UseVGPROffset) { 1842 // For an AGPR spill, we reuse the same temp VGPR for the offset and the 1843 // intermediate accvgpr_write. 
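      // The offset VGPR is therefore only killed on the last subregister
      // access, and never when it doubles as the AGPR intermediate, since it
      // is still read afterwards.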
1844 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR)); 1845 } 1846 1847 if (!IsFlat) 1848 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1849 1850 if (SOffset == AMDGPU::NoRegister) { 1851 if (!IsFlat) { 1852 if (UseVGPROffset && ScratchOffsetReg) { 1853 MIB.addReg(ScratchOffsetReg); 1854 } else { 1855 assert(FuncInfo->isBottomOfStack()); 1856 MIB.addImm(0); 1857 } 1858 } 1859 } else { 1860 MIB.addReg(SOffset, SOffsetRegState); 1861 } 1862 1863 MIB.addImm(Offset + RegOffset); 1864 1865 bool LastUse = MMO->getFlags() & MOLastUse; 1866 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol 1867 1868 if (!IsFlat) 1869 MIB.addImm(0); // swz 1870 MIB.addMemOperand(NewMMO); 1871 1872 if (!IsAGPR && NeedSuperRegDef) 1873 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1874 1875 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) { 1876 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1877 FinalReg) 1878 .addReg(TmpIntermediateVGPR, RegState::Kill); 1879 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1880 } 1881 1882 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) 1883 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1884 1885 // The epilog restore of a wwm-scratch register can cause undesired 1886 // optimization during machine-cp post PrologEpilogInserter if the same 1887 // register was assigned for return value ABI lowering with a COPY 1888 // instruction. As given below, with the epilog reload, the earlier COPY 1889 // appeared to be dead during machine-cp. 1890 // ... 1891 // v0 in WWM operation, needs the WWM spill at prolog/epilog. 1892 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 1893 // ... 1894 // Epilog block: 1895 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 1896 // ... 1897 // WWM spill restore to preserve the inactive lanes of v0. 1898 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 1899 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 1900 // $exec = S_MOV_B64 killed $sgpr4_sgpr5 1901 // ... 1902 // SI_RETURN implicit $vgpr0 1903 // ... 1904 // To fix it, mark the same reg as a tied op for such restore instructions 1905 // so that it marks a usage for the preceding COPY. 1906 if (!IsStore && MI != MBB.end() && MI->isReturn() && 1907 MI->readsRegister(SubReg, this)) { 1908 MIB.addReg(SubReg, RegState::Implicit); 1909 MIB->tieOperands(0, MIB->getNumOperands() - 1); 1910 } 1911 } 1912 1913 if (ScratchOffsetRegDelta != 0) { 1914 // Subtract the offset we added to the ScratchOffset register. 1915 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1916 .addReg(SOffset) 1917 .addImm(-ScratchOffsetRegDelta); 1918 } 1919 } 1920 1921 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1922 int Offset, bool IsLoad, 1923 bool IsKill) const { 1924 // Load/store VGPR 1925 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 1926 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1927 1928 Register FrameReg = 1929 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 1930 ? getBaseRegister() 1931 : getFrameRegister(SB.MF); 1932 1933 Align Alignment = FrameInfo.getObjectAlign(Index); 1934 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 1935 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 1936 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1937 SB.EltSize, Alignment); 1938 1939 if (IsLoad) { 1940 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1941 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1942 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1943 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS); 1944 } else { 1945 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1946 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1947 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1948 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS); 1949 // This only ever adds one VGPR spill 1950 SB.MFI.addToSpilledVGPRs(1); 1951 } 1952 } 1953 1954 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, 1955 RegScavenger *RS, SlotIndexes *Indexes, 1956 LiveIntervals *LIS, bool OnlyToVGPR, 1957 bool SpillToPhysVGPRLane) const { 1958 assert(!MI->getOperand(0).isUndef() && 1959 "undef spill should have been deleted earlier"); 1960 1961 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1962 1963 ArrayRef<SpilledReg> VGPRSpills = 1964 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) 1965 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); 1966 bool SpillToVGPR = !VGPRSpills.empty(); 1967 if (OnlyToVGPR && !SpillToVGPR) 1968 return false; 1969 1970 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1971 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1972 1973 if (SpillToVGPR) { 1974 1975 // Since stack slot coloring pass is trying to optimize SGPR spills, 1976 // VGPR lanes (mapped from spill stack slot) may be shared for SGPR 1977 // spills of different sizes. This accounts for number of VGPR lanes alloted 1978 // equal to the largest SGPR being spilled in them. 1979 assert(SB.NumSubRegs <= VGPRSpills.size() && 1980 "Num of SGPRs spilled should be less than or equal to num of " 1981 "the VGPR lanes."); 1982 1983 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1984 Register SubReg = 1985 SB.NumSubRegs == 1 1986 ? SB.SuperReg 1987 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1988 SpilledReg Spill = VGPRSpills[i]; 1989 1990 bool IsFirstSubreg = i == 0; 1991 bool IsLastSubreg = i == SB.NumSubRegs - 1; 1992 bool UseKill = SB.IsKill && IsLastSubreg; 1993 1994 1995 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1996 // spill to this specific vgpr in the first basic block. 1997 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1998 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR) 1999 .addReg(SubReg, getKillRegState(UseKill)) 2000 .addImm(Spill.Lane) 2001 .addReg(Spill.VGPR); 2002 if (Indexes) { 2003 if (IsFirstSubreg) 2004 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 2005 else 2006 Indexes->insertMachineInstrInMaps(*MIB); 2007 } 2008 2009 if (IsFirstSubreg && SB.NumSubRegs > 1) { 2010 // We may be spilling a super-register which is only partially defined, 2011 // and need to ensure later spills think the value is defined. 2012 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2013 } 2014 2015 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg)) 2016 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 2017 2018 // FIXME: Since this spills to another register instead of an actual 2019 // frame index, we should delete the frame index when all references to 2020 // it are fixed. 2021 } 2022 } else { 2023 SB.prepare(); 2024 2025 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 
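    // Roughly, the memory path below produces, per batch of up to wavesize
    // lanes (registers and lane numbers illustrative only):
    //   v_writelane_b32 vTmp, s4, 0
    //   v_writelane_b32 vTmp, s5, 1
    //   ...
    //   buffer_store_dword / scratch_store_dword vTmp  ; via readWriteTmpVGPR()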
2026 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 2027 2028 // Per VGPR helper data 2029 auto PVD = SB.getPerVGPRData(); 2030 2031 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2032 unsigned TmpVGPRFlags = RegState::Undef; 2033 2034 // Write sub registers into the VGPR 2035 for (unsigned i = Offset * PVD.PerVGPR, 2036 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2037 i < e; ++i) { 2038 Register SubReg = 2039 SB.NumSubRegs == 1 2040 ? SB.SuperReg 2041 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2042 2043 MachineInstrBuilder WriteLane = 2044 BuildMI(*SB.MBB, MI, SB.DL, 2045 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR) 2046 .addReg(SubReg, SubKillState) 2047 .addImm(i % PVD.PerVGPR) 2048 .addReg(SB.TmpVGPR, TmpVGPRFlags); 2049 TmpVGPRFlags = 0; 2050 2051 if (Indexes) { 2052 if (i == 0) 2053 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane); 2054 else 2055 Indexes->insertMachineInstrInMaps(*WriteLane); 2056 } 2057 2058 // There could be undef components of a spilled super register. 2059 // TODO: Can we detect this and skip the spill? 2060 if (SB.NumSubRegs > 1) { 2061 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 2062 unsigned SuperKillState = 0; 2063 if (i + 1 == SB.NumSubRegs) 2064 SuperKillState |= getKillRegState(SB.IsKill); 2065 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 2066 } 2067 } 2068 2069 // Write out VGPR 2070 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 2071 } 2072 2073 SB.restore(); 2074 } 2075 2076 MI->eraseFromParent(); 2077 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 2078 2079 if (LIS) 2080 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 2081 2082 return true; 2083 } 2084 2085 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, 2086 RegScavenger *RS, SlotIndexes *Indexes, 2087 LiveIntervals *LIS, bool OnlyToVGPR, 2088 bool SpillToPhysVGPRLane) const { 2089 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 2090 2091 ArrayRef<SpilledReg> VGPRSpills = 2092 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) 2093 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); 2094 bool SpillToVGPR = !VGPRSpills.empty(); 2095 if (OnlyToVGPR && !SpillToVGPR) 2096 return false; 2097 2098 if (SpillToVGPR) { 2099 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 2100 Register SubReg = 2101 SB.NumSubRegs == 1 2102 ? SB.SuperReg 2103 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2104 2105 SpilledReg Spill = VGPRSpills[i]; 2106 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 2107 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 2108 .addReg(Spill.VGPR) 2109 .addImm(Spill.Lane); 2110 if (SB.NumSubRegs > 1 && i == 0) 2111 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2112 if (Indexes) { 2113 if (i == e - 1) 2114 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 2115 else 2116 Indexes->insertMachineInstrInMaps(*MIB); 2117 } 2118 } 2119 } else { 2120 SB.prepare(); 2121 2122 // Per VGPR helper data 2123 auto PVD = SB.getPerVGPRData(); 2124 2125 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2126 // Load in VGPR data 2127 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 2128 2129 // Unpack lanes 2130 for (unsigned i = Offset * PVD.PerVGPR, 2131 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2132 i < e; ++i) { 2133 Register SubReg = 2134 SB.NumSubRegs == 1 2135 ? 
SB.SuperReg 2136 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2137 2138 bool LastSubReg = (i + 1 == e); 2139 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 2140 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 2141 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 2142 .addImm(i); 2143 if (SB.NumSubRegs > 1 && i == 0) 2144 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2145 if (Indexes) { 2146 if (i == e - 1) 2147 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 2148 else 2149 Indexes->insertMachineInstrInMaps(*MIB); 2150 } 2151 } 2152 } 2153 2154 SB.restore(); 2155 } 2156 2157 MI->eraseFromParent(); 2158 2159 if (LIS) 2160 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 2161 2162 return true; 2163 } 2164 2165 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 2166 MachineBasicBlock &RestoreMBB, 2167 Register SGPR, RegScavenger *RS) const { 2168 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 2169 RS); 2170 SB.prepare(); 2171 // Generate the spill of SGPR to SB.TmpVGPR. 2172 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 2173 auto PVD = SB.getPerVGPRData(); 2174 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2175 unsigned TmpVGPRFlags = RegState::Undef; 2176 // Write sub registers into the VGPR 2177 for (unsigned i = Offset * PVD.PerVGPR, 2178 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2179 i < e; ++i) { 2180 Register SubReg = 2181 SB.NumSubRegs == 1 2182 ? SB.SuperReg 2183 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2184 2185 MachineInstrBuilder WriteLane = 2186 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 2187 SB.TmpVGPR) 2188 .addReg(SubReg, SubKillState) 2189 .addImm(i % PVD.PerVGPR) 2190 .addReg(SB.TmpVGPR, TmpVGPRFlags); 2191 TmpVGPRFlags = 0; 2192 // There could be undef components of a spilled super register. 2193 // TODO: Can we detect this and skip the spill? 2194 if (SB.NumSubRegs > 1) { 2195 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 2196 unsigned SuperKillState = 0; 2197 if (i + 1 == SB.NumSubRegs) 2198 SuperKillState |= getKillRegState(SB.IsKill); 2199 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 2200 } 2201 } 2202 // Don't need to write VGPR out. 2203 } 2204 2205 // Restore clobbered registers in the specified restore block. 2206 MI = RestoreMBB.end(); 2207 SB.setMI(&RestoreMBB, MI); 2208 // Generate the restore of SGPR from SB.TmpVGPR. 2209 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2210 // Don't need to load VGPR in. 2211 // Unpack lanes 2212 for (unsigned i = Offset * PVD.PerVGPR, 2213 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2214 i < e; ++i) { 2215 Register SubReg = 2216 SB.NumSubRegs == 1 2217 ? SB.SuperReg 2218 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2219 bool LastSubReg = (i + 1 == e); 2220 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 2221 SubReg) 2222 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 2223 .addImm(i); 2224 if (SB.NumSubRegs > 1 && i == 0) 2225 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2226 } 2227 } 2228 SB.restore(); 2229 2230 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 2231 return false; 2232 } 2233 2234 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 2235 /// a VGPR and the stack slot can be safely eliminated when all other users are 2236 /// handled. 
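/// Returns false when no VGPR lanes were reserved for this frame index; in
/// that case nothing is emitted, so the caller can fall back to the ordinary
/// memory spill path.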
2237 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 2238 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, 2239 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { 2240 switch (MI->getOpcode()) { 2241 case AMDGPU::SI_SPILL_S1024_SAVE: 2242 case AMDGPU::SI_SPILL_S512_SAVE: 2243 case AMDGPU::SI_SPILL_S384_SAVE: 2244 case AMDGPU::SI_SPILL_S352_SAVE: 2245 case AMDGPU::SI_SPILL_S320_SAVE: 2246 case AMDGPU::SI_SPILL_S288_SAVE: 2247 case AMDGPU::SI_SPILL_S256_SAVE: 2248 case AMDGPU::SI_SPILL_S224_SAVE: 2249 case AMDGPU::SI_SPILL_S192_SAVE: 2250 case AMDGPU::SI_SPILL_S160_SAVE: 2251 case AMDGPU::SI_SPILL_S128_SAVE: 2252 case AMDGPU::SI_SPILL_S96_SAVE: 2253 case AMDGPU::SI_SPILL_S64_SAVE: 2254 case AMDGPU::SI_SPILL_S32_SAVE: 2255 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2256 case AMDGPU::SI_SPILL_S1024_RESTORE: 2257 case AMDGPU::SI_SPILL_S512_RESTORE: 2258 case AMDGPU::SI_SPILL_S384_RESTORE: 2259 case AMDGPU::SI_SPILL_S352_RESTORE: 2260 case AMDGPU::SI_SPILL_S320_RESTORE: 2261 case AMDGPU::SI_SPILL_S288_RESTORE: 2262 case AMDGPU::SI_SPILL_S256_RESTORE: 2263 case AMDGPU::SI_SPILL_S224_RESTORE: 2264 case AMDGPU::SI_SPILL_S192_RESTORE: 2265 case AMDGPU::SI_SPILL_S160_RESTORE: 2266 case AMDGPU::SI_SPILL_S128_RESTORE: 2267 case AMDGPU::SI_SPILL_S96_RESTORE: 2268 case AMDGPU::SI_SPILL_S64_RESTORE: 2269 case AMDGPU::SI_SPILL_S32_RESTORE: 2270 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2271 default: 2272 llvm_unreachable("not an SGPR spill instruction"); 2273 } 2274 } 2275 2276 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 2277 int SPAdj, unsigned FIOperandNum, 2278 RegScavenger *RS) const { 2279 MachineFunction *MF = MI->getParent()->getParent(); 2280 MachineBasicBlock *MBB = MI->getParent(); 2281 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 2282 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 2283 const SIInstrInfo *TII = ST.getInstrInfo(); 2284 const DebugLoc &DL = MI->getDebugLoc(); 2285 2286 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 2287 2288 assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) && 2289 "unreserved scratch RSRC register"); 2290 2291 MachineOperand *FIOp = &MI->getOperand(FIOperandNum); 2292 int Index = MI->getOperand(FIOperandNum).getIndex(); 2293 2294 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 2295 ? 
getBaseRegister() 2296 : getFrameRegister(*MF); 2297 2298 switch (MI->getOpcode()) { 2299 // SGPR register spill 2300 case AMDGPU::SI_SPILL_S1024_SAVE: 2301 case AMDGPU::SI_SPILL_S512_SAVE: 2302 case AMDGPU::SI_SPILL_S384_SAVE: 2303 case AMDGPU::SI_SPILL_S352_SAVE: 2304 case AMDGPU::SI_SPILL_S320_SAVE: 2305 case AMDGPU::SI_SPILL_S288_SAVE: 2306 case AMDGPU::SI_SPILL_S256_SAVE: 2307 case AMDGPU::SI_SPILL_S224_SAVE: 2308 case AMDGPU::SI_SPILL_S192_SAVE: 2309 case AMDGPU::SI_SPILL_S160_SAVE: 2310 case AMDGPU::SI_SPILL_S128_SAVE: 2311 case AMDGPU::SI_SPILL_S96_SAVE: 2312 case AMDGPU::SI_SPILL_S64_SAVE: 2313 case AMDGPU::SI_SPILL_S32_SAVE: { 2314 return spillSGPR(MI, Index, RS); 2315 } 2316 2317 // SGPR register restore 2318 case AMDGPU::SI_SPILL_S1024_RESTORE: 2319 case AMDGPU::SI_SPILL_S512_RESTORE: 2320 case AMDGPU::SI_SPILL_S384_RESTORE: 2321 case AMDGPU::SI_SPILL_S352_RESTORE: 2322 case AMDGPU::SI_SPILL_S320_RESTORE: 2323 case AMDGPU::SI_SPILL_S288_RESTORE: 2324 case AMDGPU::SI_SPILL_S256_RESTORE: 2325 case AMDGPU::SI_SPILL_S224_RESTORE: 2326 case AMDGPU::SI_SPILL_S192_RESTORE: 2327 case AMDGPU::SI_SPILL_S160_RESTORE: 2328 case AMDGPU::SI_SPILL_S128_RESTORE: 2329 case AMDGPU::SI_SPILL_S96_RESTORE: 2330 case AMDGPU::SI_SPILL_S64_RESTORE: 2331 case AMDGPU::SI_SPILL_S32_RESTORE: { 2332 return restoreSGPR(MI, Index, RS); 2333 } 2334 2335 // VGPR register spill 2336 case AMDGPU::SI_SPILL_V1024_SAVE: 2337 case AMDGPU::SI_SPILL_V512_SAVE: 2338 case AMDGPU::SI_SPILL_V384_SAVE: 2339 case AMDGPU::SI_SPILL_V352_SAVE: 2340 case AMDGPU::SI_SPILL_V320_SAVE: 2341 case AMDGPU::SI_SPILL_V288_SAVE: 2342 case AMDGPU::SI_SPILL_V256_SAVE: 2343 case AMDGPU::SI_SPILL_V224_SAVE: 2344 case AMDGPU::SI_SPILL_V192_SAVE: 2345 case AMDGPU::SI_SPILL_V160_SAVE: 2346 case AMDGPU::SI_SPILL_V128_SAVE: 2347 case AMDGPU::SI_SPILL_V96_SAVE: 2348 case AMDGPU::SI_SPILL_V64_SAVE: 2349 case AMDGPU::SI_SPILL_V32_SAVE: 2350 case AMDGPU::SI_SPILL_A1024_SAVE: 2351 case AMDGPU::SI_SPILL_A512_SAVE: 2352 case AMDGPU::SI_SPILL_A384_SAVE: 2353 case AMDGPU::SI_SPILL_A352_SAVE: 2354 case AMDGPU::SI_SPILL_A320_SAVE: 2355 case AMDGPU::SI_SPILL_A288_SAVE: 2356 case AMDGPU::SI_SPILL_A256_SAVE: 2357 case AMDGPU::SI_SPILL_A224_SAVE: 2358 case AMDGPU::SI_SPILL_A192_SAVE: 2359 case AMDGPU::SI_SPILL_A160_SAVE: 2360 case AMDGPU::SI_SPILL_A128_SAVE: 2361 case AMDGPU::SI_SPILL_A96_SAVE: 2362 case AMDGPU::SI_SPILL_A64_SAVE: 2363 case AMDGPU::SI_SPILL_A32_SAVE: 2364 case AMDGPU::SI_SPILL_AV1024_SAVE: 2365 case AMDGPU::SI_SPILL_AV512_SAVE: 2366 case AMDGPU::SI_SPILL_AV384_SAVE: 2367 case AMDGPU::SI_SPILL_AV352_SAVE: 2368 case AMDGPU::SI_SPILL_AV320_SAVE: 2369 case AMDGPU::SI_SPILL_AV288_SAVE: 2370 case AMDGPU::SI_SPILL_AV256_SAVE: 2371 case AMDGPU::SI_SPILL_AV224_SAVE: 2372 case AMDGPU::SI_SPILL_AV192_SAVE: 2373 case AMDGPU::SI_SPILL_AV160_SAVE: 2374 case AMDGPU::SI_SPILL_AV128_SAVE: 2375 case AMDGPU::SI_SPILL_AV96_SAVE: 2376 case AMDGPU::SI_SPILL_AV64_SAVE: 2377 case AMDGPU::SI_SPILL_AV32_SAVE: 2378 case AMDGPU::SI_SPILL_WWM_V32_SAVE: 2379 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: { 2380 const MachineOperand *VData = TII->getNamedOperand(*MI, 2381 AMDGPU::OpName::vdata); 2382 if (VData->isUndef()) { 2383 MI->eraseFromParent(); 2384 return true; 2385 } 2386 2387 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2388 MFI->getStackPtrOffsetReg()); 2389 2390 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 2391 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 2392 auto *MBB = MI->getParent(); 2393 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2394 if (IsWWMRegSpill) { 2395 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2396 RS->isRegUsed(AMDGPU::SCC)); 2397 } 2398 buildSpillLoadStore( 2399 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2400 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2401 *MI->memoperands_begin(), RS); 2402 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 2403 if (IsWWMRegSpill) 2404 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2405 2406 MI->eraseFromParent(); 2407 return true; 2408 } 2409 case AMDGPU::SI_SPILL_V32_RESTORE: 2410 case AMDGPU::SI_SPILL_V64_RESTORE: 2411 case AMDGPU::SI_SPILL_V96_RESTORE: 2412 case AMDGPU::SI_SPILL_V128_RESTORE: 2413 case AMDGPU::SI_SPILL_V160_RESTORE: 2414 case AMDGPU::SI_SPILL_V192_RESTORE: 2415 case AMDGPU::SI_SPILL_V224_RESTORE: 2416 case AMDGPU::SI_SPILL_V256_RESTORE: 2417 case AMDGPU::SI_SPILL_V288_RESTORE: 2418 case AMDGPU::SI_SPILL_V320_RESTORE: 2419 case AMDGPU::SI_SPILL_V352_RESTORE: 2420 case AMDGPU::SI_SPILL_V384_RESTORE: 2421 case AMDGPU::SI_SPILL_V512_RESTORE: 2422 case AMDGPU::SI_SPILL_V1024_RESTORE: 2423 case AMDGPU::SI_SPILL_A32_RESTORE: 2424 case AMDGPU::SI_SPILL_A64_RESTORE: 2425 case AMDGPU::SI_SPILL_A96_RESTORE: 2426 case AMDGPU::SI_SPILL_A128_RESTORE: 2427 case AMDGPU::SI_SPILL_A160_RESTORE: 2428 case AMDGPU::SI_SPILL_A192_RESTORE: 2429 case AMDGPU::SI_SPILL_A224_RESTORE: 2430 case AMDGPU::SI_SPILL_A256_RESTORE: 2431 case AMDGPU::SI_SPILL_A288_RESTORE: 2432 case AMDGPU::SI_SPILL_A320_RESTORE: 2433 case AMDGPU::SI_SPILL_A352_RESTORE: 2434 case AMDGPU::SI_SPILL_A384_RESTORE: 2435 case AMDGPU::SI_SPILL_A512_RESTORE: 2436 case AMDGPU::SI_SPILL_A1024_RESTORE: 2437 case AMDGPU::SI_SPILL_AV32_RESTORE: 2438 case AMDGPU::SI_SPILL_AV64_RESTORE: 2439 case AMDGPU::SI_SPILL_AV96_RESTORE: 2440 case AMDGPU::SI_SPILL_AV128_RESTORE: 2441 case AMDGPU::SI_SPILL_AV160_RESTORE: 2442 case AMDGPU::SI_SPILL_AV192_RESTORE: 2443 case AMDGPU::SI_SPILL_AV224_RESTORE: 2444 case AMDGPU::SI_SPILL_AV256_RESTORE: 2445 case AMDGPU::SI_SPILL_AV288_RESTORE: 2446 case AMDGPU::SI_SPILL_AV320_RESTORE: 2447 case AMDGPU::SI_SPILL_AV352_RESTORE: 2448 case AMDGPU::SI_SPILL_AV384_RESTORE: 2449 case AMDGPU::SI_SPILL_AV512_RESTORE: 2450 case AMDGPU::SI_SPILL_AV1024_RESTORE: 2451 case AMDGPU::SI_SPILL_WWM_V32_RESTORE: 2452 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: { 2453 const MachineOperand *VData = TII->getNamedOperand(*MI, 2454 AMDGPU::OpName::vdata); 2455 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2456 MFI->getStackPtrOffsetReg()); 2457 2458 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 2459 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 2460 auto *MBB = MI->getParent(); 2461 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2462 if (IsWWMRegSpill) { 2463 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2464 RS->isRegUsed(AMDGPU::SCC)); 2465 } 2466 2467 buildSpillLoadStore( 2468 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2469 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2470 *MI->memoperands_begin(), RS); 2471 2472 if (IsWWMRegSpill) 2473 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2474 2475 MI->eraseFromParent(); 2476 return true; 2477 } 2478 case AMDGPU::V_ADD_U32_e32: 2479 case AMDGPU::V_ADD_U32_e64: 2480 case AMDGPU::V_ADD_CO_U32_e32: 2481 case AMDGPU::V_ADD_CO_U32_e64: { 2482 // TODO: Handle sub, and, or. 2483 unsigned NumDefs = MI->getNumExplicitDefs(); 2484 unsigned Src0Idx = NumDefs; 2485 2486 bool HasClamp = false; 2487 MachineOperand *VCCOp = nullptr; 2488 2489 switch (MI->getOpcode()) { 2490 case AMDGPU::V_ADD_U32_e32: 2491 break; 2492 case AMDGPU::V_ADD_U32_e64: 2493 HasClamp = MI->getOperand(3).getImm(); 2494 break; 2495 case AMDGPU::V_ADD_CO_U32_e32: 2496 VCCOp = &MI->getOperand(3); 2497 break; 2498 case AMDGPU::V_ADD_CO_U32_e64: 2499 VCCOp = &MI->getOperand(1); 2500 HasClamp = MI->getOperand(4).getImm(); 2501 break; 2502 default: 2503 break; 2504 } 2505 bool DeadVCC = !VCCOp || VCCOp->isDead(); 2506 MachineOperand &DstOp = MI->getOperand(0); 2507 Register DstReg = DstOp.getReg(); 2508 2509 unsigned OtherOpIdx = 2510 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx; 2511 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx); 2512 2513 unsigned Src1Idx = Src0Idx + 1; 2514 Register MaterializedReg = FrameReg; 2515 Register ScavengedVGPR; 2516 2517 int64_t Offset = FrameInfo.getObjectOffset(Index); 2518 // For the non-immediate case, we could fall through to the default 2519 // handling, but we do an in-place update of the result register here to 2520 // avoid scavenging another register. 2521 if (OtherOp->isImm()) { 2522 int64_t TotalOffset = OtherOp->getImm() + Offset; 2523 2524 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) && 2525 !AMDGPU::isInlinableIntLiteral(TotalOffset)) { 2526 // If we can't support a VOP3 literal in the VALU instruction, we 2527 // can't specially fold into the add. 2528 // TODO: Handle VOP3->VOP2 shrink to support the fold. 2529 break; 2530 } 2531 2532 OtherOp->setImm(TotalOffset); 2533 Offset = 0; 2534 } 2535 2536 if (FrameReg && !ST.enableFlatScratch()) { 2537 // We should just do an in-place update of the result register. However, 2538 // the value there may also be used by the add, in which case we need a 2539 // temporary register. 2540 // 2541 // FIXME: The scavenger is not finding the result register in the 2542 // common case where the add does not read the register. 2543 2544 ScavengedVGPR = RS->scavengeRegisterBackwards( 2545 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0); 2546 2547 // TODO: If we have a free SGPR, it's sometimes better to use a scalar 2548 // shift. 
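      // On a wave64 target this emits something like (registers illustrative):
      //   v_lshrrev_b32_e64 v1, 6, s33   ; frame reg: wave-space -> lane-space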
2549 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64)) 2550 .addDef(ScavengedVGPR, RegState::Renamable) 2551 .addImm(ST.getWavefrontSizeLog2()) 2552 .addReg(FrameReg); 2553 MaterializedReg = ScavengedVGPR; 2554 } 2555 2556 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) { 2557 if (ST.enableFlatScratch() && 2558 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) { 2559 // We didn't need the shift above, so we have an SGPR for the frame 2560 // register, but may have a VGPR only operand. 2561 // 2562 // TODO: On gfx10+, we can easily change the opcode to the e64 version 2563 // and use the higher constant bus restriction to avoid this copy. 2564 2565 if (!ScavengedVGPR) { 2566 ScavengedVGPR = RS->scavengeRegisterBackwards( 2567 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, 2568 /*SPAdj=*/0); 2569 } 2570 2571 assert(ScavengedVGPR != DstReg); 2572 2573 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR) 2574 .addReg(MaterializedReg, 2575 MaterializedReg != FrameReg ? RegState::Kill : 0); 2576 MaterializedReg = ScavengedVGPR; 2577 } 2578 2579 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC 2580 // is not live, we could use a scalar add + vector add instead of 2 2581 // vector adds. 2582 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode())) 2583 .addDef(DstReg, RegState::Renamable); 2584 if (NumDefs == 2) 2585 AddI32.add(MI->getOperand(1)); 2586 2587 unsigned MaterializedRegFlags = 2588 MaterializedReg != FrameReg ? RegState::Kill : 0; 2589 2590 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) { 2591 // If we know we have a VGPR already, it's more likely the other 2592 // operand is a legal vsrc0. 2593 AddI32 2594 .add(*OtherOp) 2595 .addReg(MaterializedReg, MaterializedRegFlags); 2596 } else { 2597 // Commute operands to avoid violating VOP2 restrictions. This will 2598 // typically happen when using scratch. 2599 AddI32 2600 .addReg(MaterializedReg, MaterializedRegFlags) 2601 .add(*OtherOp); 2602 } 2603 2604 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || 2605 MI->getOpcode() == AMDGPU::V_ADD_U32_e64) 2606 AddI32.addImm(0); // clamp 2607 2608 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32) 2609 AddI32.setOperandDead(3); // Dead vcc 2610 2611 MaterializedReg = DstReg; 2612 2613 OtherOp->ChangeToRegister(MaterializedReg, false); 2614 OtherOp->setIsKill(true); 2615 FIOp->ChangeToImmediate(Offset); 2616 Offset = 0; 2617 } else if (Offset != 0) { 2618 assert(!MaterializedReg); 2619 FIOp->ChangeToImmediate(Offset); 2620 Offset = 0; 2621 } else { 2622 if (DeadVCC && !HasClamp) { 2623 assert(Offset == 0); 2624 2625 // TODO: Losing kills and implicit operands. Just mutate to copy and 2626 // let lowerCopy deal with it? 2627 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) { 2628 // Folded to an identity copy. 2629 MI->eraseFromParent(); 2630 return true; 2631 } 2632 2633 // The immediate value should be in OtherOp 2634 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32)); 2635 MI->removeOperand(FIOperandNum); 2636 2637 unsigned NumOps = MI->getNumOperands(); 2638 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I) 2639 MI->removeOperand(I); 2640 2641 if (NumDefs == 2) 2642 MI->removeOperand(1); 2643 2644 // The code below can't deal with a mov. 2645 return true; 2646 } 2647 2648 // This folded to a constant, but we have to keep the add around for 2649 // pointless implicit defs or clamp modifier. 2650 FIOp->ChangeToImmediate(0); 2651 } 2652 2653 // Try to improve legality by commuting. 
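    // Commuting can help because the VOP2 forms only accept an SGPR or literal
    // in src0; src1 generally must be a VGPR.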
2654 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) { 2655 std::swap(FIOp, OtherOp); 2656 std::swap(FIOperandNum, OtherOpIdx); 2657 } 2658 2659 // We need at most one mov to satisfy the operand constraints. Prefer to 2660 // move the FI operand first, as it may be a literal in a VOP3 2661 // instruction. 2662 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) { 2663 if (!TII->isOperandLegal(*MI, SrcIdx)) { 2664 // If commuting didn't make the operands legal, we need to materialize 2665 // in a register. 2666 // TODO: Can use SGPR on gfx10+ in some cases. 2667 if (!ScavengedVGPR) { 2668 ScavengedVGPR = RS->scavengeRegisterBackwards( 2669 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, 2670 /*SPAdj=*/0); 2671 } 2672 2673 assert(ScavengedVGPR != DstReg); 2674 2675 MachineOperand &Src = MI->getOperand(SrcIdx); 2676 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR) 2677 .add(Src); 2678 2679 Src.ChangeToRegister(ScavengedVGPR, false); 2680 Src.setIsKill(true); 2681 break; 2682 } 2683 } 2684 2685 // Fold out add of 0 case that can appear in kernels. 2686 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) { 2687 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) { 2688 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp); 2689 } 2690 2691 MI->eraseFromParent(); 2692 } 2693 2694 return true; 2695 } 2696 case AMDGPU::S_ADD_I32: { 2697 // TODO: Handle s_or_b32, s_and_b32. 2698 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1; 2699 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx); 2700 2701 assert(FrameReg || MFI->isBottomOfStack()); 2702 2703 MachineOperand &DstOp = MI->getOperand(0); 2704 const DebugLoc &DL = MI->getDebugLoc(); 2705 Register MaterializedReg = FrameReg; 2706 2707 // Defend against live scc, which should never happen in practice. 2708 bool DeadSCC = MI->getOperand(3).isDead(); 2709 2710 Register TmpReg; 2711 2712 // FIXME: Scavenger should figure out that the result register is 2713 // available. Also should do this for the v_add case. 2714 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg()) 2715 TmpReg = DstOp.getReg(); 2716 2717 if (FrameReg && !ST.enableFlatScratch()) { 2718 // FIXME: In the common case where the add does not also read its result 2719 // (i.e. this isn't a reg += fi), it's not finding the dest reg as 2720 // available. 2721 if (!TmpReg) 2722 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 2723 MI, false, 0); 2724 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32)) 2725 .addDef(TmpReg, RegState::Renamable) 2726 .addReg(FrameReg) 2727 .addImm(ST.getWavefrontSizeLog2()) 2728 .setOperandDead(3); // Set SCC dead 2729 MaterializedReg = TmpReg; 2730 } 2731 2732 int64_t Offset = FrameInfo.getObjectOffset(Index); 2733 2734 // For the non-immediate case, we could fall through to the default 2735 // handling, but we do an in-place update of the result register here to 2736 // avoid scavenging another register. 2737 if (OtherOp.isImm()) { 2738 OtherOp.setImm(OtherOp.getImm() + Offset); 2739 Offset = 0; 2740 2741 if (MaterializedReg) 2742 FIOp->ChangeToRegister(MaterializedReg, false); 2743 else 2744 FIOp->ChangeToImmediate(0); 2745 } else if (MaterializedReg) { 2746 // If we can't fold the other operand, do another increment. 
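      // Roughly, 'sDst = s_add_i32 sOther, frameindex' becomes (illustrative):
      //   s_lshr_b32 sTmp, sFP, log2(wavesize)   ; from the shift above
      //   s_add_i32  sDst, sTmp, sOther
      // with the frame index operand itself rewritten to the remaining
      // immediate offset.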
2747 Register DstReg = DstOp.getReg(); 2748 2749 if (!TmpReg && MaterializedReg == FrameReg) { 2750 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 2751 MI, /*RestoreAfter=*/false, 0, 2752 /*AllowSpill=*/false); 2753 DstReg = TmpReg; 2754 } 2755 2756 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32)) 2757 .addDef(DstReg, RegState::Renamable) 2758 .addReg(MaterializedReg, RegState::Kill) 2759 .add(OtherOp); 2760 if (DeadSCC) 2761 AddI32.setOperandDead(3); 2762 2763 MaterializedReg = DstReg; 2764 2765 OtherOp.ChangeToRegister(MaterializedReg, false); 2766 OtherOp.setIsKill(true); 2767 OtherOp.setIsRenamable(true); 2768 FIOp->ChangeToImmediate(Offset); 2769 } else { 2770 // If we don't have any other offset to apply, we can just directly 2771 // interpret the frame index as the offset. 2772 FIOp->ChangeToImmediate(Offset); 2773 } 2774 2775 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) { 2776 assert(Offset == 0); 2777 MI->removeOperand(3); 2778 MI->removeOperand(OtherOpIdx); 2779 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); 2780 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) { 2781 assert(Offset == 0); 2782 MI->removeOperand(3); 2783 MI->removeOperand(FIOperandNum); 2784 MI->setDesc( 2785 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); 2786 } 2787 2788 assert(!FIOp->isFI()); 2789 return true; 2790 } 2791 default: { 2792 break; 2793 } 2794 } 2795 2796 int64_t Offset = FrameInfo.getObjectOffset(Index); 2797 if (ST.enableFlatScratch()) { 2798 if (TII->isFLATScratch(*MI)) { 2799 assert( 2800 (int16_t)FIOperandNum == 2801 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr)); 2802 2803 // The offset is always swizzled, just replace it 2804 if (FrameReg) 2805 FIOp->ChangeToRegister(FrameReg, false); 2806 2807 MachineOperand *OffsetOp = 2808 TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 2809 int64_t NewOffset = Offset + OffsetOp->getImm(); 2810 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 2811 SIInstrFlags::FlatScratch)) { 2812 OffsetOp->setImm(NewOffset); 2813 if (FrameReg) 2814 return false; 2815 Offset = 0; 2816 } 2817 2818 if (!Offset) { 2819 unsigned Opc = MI->getOpcode(); 2820 int NewOpc = -1; 2821 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) { 2822 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc); 2823 } else if (ST.hasFlatScratchSTMode()) { 2824 // On GFX10 we have ST mode to use no registers for an address. 2825 // Otherwise we need to materialize 0 into an SGPR. 2826 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); 2827 } 2828 2829 if (NewOpc != -1) { 2830 // removeOperand doesn't fixup tied operand indexes as it goes, so 2831 // it asserts. Untie vdst_in for now and retie them afterwards. 
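          // (vdst_in is only present on loads that partially update their
          // destination, e.g. tfe or d16 variants, where it is tied to vdst.)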
2832 int VDstIn = 2833 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); 2834 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() && 2835 MI->getOperand(VDstIn).isTied(); 2836 if (TiedVDst) 2837 MI->untieRegOperand(VDstIn); 2838 2839 MI->removeOperand( 2840 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); 2841 2842 if (TiedVDst) { 2843 int NewVDst = 2844 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); 2845 int NewVDstIn = 2846 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in); 2847 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!"); 2848 MI->tieOperands(NewVDst, NewVDstIn); 2849 } 2850 MI->setDesc(TII->get(NewOpc)); 2851 return false; 2852 } 2853 } 2854 } 2855 2856 if (!FrameReg) { 2857 FIOp->ChangeToImmediate(Offset); 2858 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) 2859 return false; 2860 } 2861 2862 // We need to use register here. Check if we can use an SGPR or need 2863 // a VGPR. 2864 FIOp->ChangeToRegister(AMDGPU::M0, false); 2865 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp); 2866 2867 if (!Offset && FrameReg && UseSGPR) { 2868 FIOp->setReg(FrameReg); 2869 return false; 2870 } 2871 2872 const TargetRegisterClass *RC = 2873 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass; 2874 2875 Register TmpReg = 2876 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR); 2877 FIOp->setReg(TmpReg); 2878 FIOp->setIsKill(); 2879 2880 if ((!FrameReg || !Offset) && TmpReg) { 2881 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2882 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 2883 if (FrameReg) 2884 MIB.addReg(FrameReg); 2885 else 2886 MIB.addImm(Offset); 2887 2888 return false; 2889 } 2890 2891 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) && 2892 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); 2893 2894 Register TmpSReg = 2895 UseSGPR ? TmpReg 2896 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 2897 MI, false, 0, !UseSGPR); 2898 2899 // TODO: for flat scratch another attempt can be made with a VGPR index 2900 // if no SGPRs can be scavenged. 2901 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 2902 report_fatal_error("Cannot scavenge register in FI elimination!"); 2903 2904 if (!TmpSReg) { 2905 // Use frame register and restore it after. 2906 TmpSReg = FrameReg; 2907 FIOp->setReg(FrameReg); 2908 FIOp->setIsKill(false); 2909 } 2910 2911 if (NeedSaveSCC) { 2912 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!"); 2913 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg) 2914 .addReg(FrameReg) 2915 .addImm(Offset); 2916 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2917 .addReg(TmpSReg) 2918 .addImm(0); 2919 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg) 2920 .addImm(0) 2921 .addReg(TmpSReg); 2922 } else { 2923 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) 2924 .addReg(FrameReg) 2925 .addImm(Offset); 2926 } 2927 2928 if (!UseSGPR) 2929 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2930 .addReg(TmpSReg, RegState::Kill); 2931 2932 if (TmpSReg == FrameReg) { 2933 // Undo frame register modification. 
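    // When SCC is live, the same SCC-preserving trick as above is used for the
    // undo: S_ADDC_U32 folds the current SCC value into bit 0 of the result
    // (both addends being even), S_BITCMP1_B32 restores SCC from that bit, and
    // S_BITSET0_B32 clears the borrowed bit again.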
2934 if (NeedSaveSCC && 2935 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) { 2936 MachineBasicBlock::iterator I = 2937 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32), 2938 TmpSReg) 2939 .addReg(FrameReg) 2940 .addImm(-Offset); 2941 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2942 .addReg(TmpSReg) 2943 .addImm(0); 2944 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32), 2945 TmpSReg) 2946 .addImm(0) 2947 .addReg(TmpSReg); 2948 } else { 2949 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), 2950 FrameReg) 2951 .addReg(FrameReg) 2952 .addImm(-Offset); 2953 } 2954 } 2955 2956 return false; 2957 } 2958 2959 bool IsMUBUF = TII->isMUBUF(*MI); 2960 2961 if (!IsMUBUF && !MFI->isBottomOfStack()) { 2962 // Convert to a swizzled stack address by scaling by the wave size. 2963 // In an entry function/kernel the offset is already swizzled. 2964 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); 2965 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) && 2966 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); 2967 const TargetRegisterClass *RC = IsSALU && !LiveSCC 2968 ? &AMDGPU::SReg_32RegClass 2969 : &AMDGPU::VGPR_32RegClass; 2970 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || 2971 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 || 2972 MI->getOpcode() == AMDGPU::S_MOV_B32; 2973 Register ResultReg = 2974 IsCopy ? MI->getOperand(0).getReg() 2975 : RS->scavengeRegisterBackwards(*RC, MI, false, 0); 2976 2977 int64_t Offset = FrameInfo.getObjectOffset(Index); 2978 if (Offset == 0) { 2979 unsigned OpCode = 2980 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64; 2981 Register TmpResultReg = ResultReg; 2982 if (IsSALU && LiveSCC) { 2983 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 2984 MI, false, 0); 2985 } 2986 2987 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg); 2988 if (OpCode == AMDGPU::V_LSHRREV_B32_e64) 2989 // For V_LSHRREV, the operands are reversed (the shift count goes 2990 // first). 2991 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg); 2992 else 2993 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2()); 2994 if (IsSALU && !LiveSCC) 2995 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. 2996 if (IsSALU && LiveSCC) { 2997 Register NewDest = 2998 IsCopy ? ResultReg 2999 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass, 3000 Shift, false, 0); 3001 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest) 3002 .addReg(TmpResultReg); 3003 ResultReg = NewDest; 3004 } 3005 } else { 3006 MachineInstrBuilder MIB; 3007 if (!IsSALU) { 3008 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) != 3009 nullptr) { 3010 // Reuse ResultReg in intermediate step. 3011 Register ScaledReg = ResultReg; 3012 3013 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 3014 ScaledReg) 3015 .addImm(ST.getWavefrontSizeLog2()) 3016 .addReg(FrameReg); 3017 3018 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 3019 3020 // TODO: Fold if use instruction is another add of a constant. 3021 if (IsVOP2 || 3022 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 3023 // FIXME: This can fail 3024 MIB.addImm(Offset); 3025 MIB.addReg(ScaledReg, RegState::Kill); 3026 if (!IsVOP2) 3027 MIB.addImm(0); // clamp bit 3028 } else { 3029 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 3030 "Need to reuse carry out register"); 3031 3032 // Use scavenged unused carry out as offset register. 
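      // e.g. on wave64 the unused carry-out is an SGPR pair and its low half
      // can hold the constant (registers illustrative):
      //   s_mov_b32    s4, offset
      //   v_add_co_u32 v1, s[4:5], s4, v2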
3033 Register ConstOffsetReg; 3034 if (!isWave32) 3035 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 3036 else 3037 ConstOffsetReg = MIB.getReg(1); 3038 3039 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), 3040 ConstOffsetReg) 3041 .addImm(Offset); 3042 MIB.addReg(ConstOffsetReg, RegState::Kill); 3043 MIB.addReg(ScaledReg, RegState::Kill); 3044 MIB.addImm(0); // clamp bit 3045 } 3046 } 3047 } 3048 if (!MIB || IsSALU) { 3049 // We have to produce a carry out, and there isn't a free SGPR pair 3050 // for it. We can keep the whole computation on the SALU to avoid 3051 // clobbering an additional register at the cost of an extra mov. 3052 3053 // We may have 1 free scratch SGPR even though a carry out is 3054 // unavailable. Only one additional mov is needed. 3055 Register TmpScaledReg = IsCopy && IsSALU 3056 ? ResultReg 3057 : RS->scavengeRegisterBackwards( 3058 AMDGPU::SReg_32_XM0RegClass, MI, 3059 false, 0, /*AllowSpill=*/false); 3060 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 3061 Register TmpResultReg = ScaledReg; 3062 3063 if (!LiveSCC) { 3064 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg) 3065 .addReg(FrameReg) 3066 .addImm(ST.getWavefrontSizeLog2()); 3067 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg) 3068 .addReg(TmpResultReg, RegState::Kill) 3069 .addImm(Offset); 3070 } else { 3071 TmpResultReg = RS->scavengeRegisterBackwards( 3072 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true); 3073 3074 MachineInstrBuilder Add; 3075 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) { 3076 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 3077 TmpResultReg) 3078 .addImm(ST.getWavefrontSizeLog2()) 3079 .addReg(FrameReg); 3080 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) { 3081 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg) 3082 .addImm(Offset); 3083 Add.addReg(ResultReg, RegState::Kill) 3084 .addReg(TmpResultReg, RegState::Kill) 3085 .addImm(0); 3086 } else 3087 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill); 3088 } else { 3089 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) && 3090 "offset is unsafe for v_mad_u32_u24"); 3091 3092 // We start with a frame pointer with a wave space value, and 3093 // an offset in lane-space. We are materializing a lane space 3094 // value. We can either do a right shift of the frame pointer 3095 // to get to lane space, or a left shift of the offset to get 3096 // to wavespace. We can right shift after the computation to 3097 // get back to the desired per-lane value. We are using the 3098 // mad_u32_u24 primarily as an add with no carry out clobber. 3099 bool IsInlinableLiteral = 3100 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm()); 3101 if (!IsInlinableLiteral) { 3102 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), 3103 TmpResultReg) 3104 .addImm(Offset); 3105 } 3106 3107 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64), 3108 TmpResultReg); 3109 3110 if (!IsInlinableLiteral) { 3111 Add.addReg(TmpResultReg, RegState::Kill); 3112 } else { 3113 // We fold the offset into mad itself if its inlinable. 3114 Add.addImm(Offset); 3115 } 3116 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0); 3117 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 3118 TmpResultReg) 3119 .addImm(ST.getWavefrontSizeLog2()) 3120 .addReg(TmpResultReg); 3121 } 3122 3123 Register NewDest = IsCopy ? 
ResultReg 3124 : RS->scavengeRegisterBackwards( 3125 AMDGPU::SReg_32RegClass, *Add, 3126 false, 0, /*AllowSpill=*/true); 3127 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 3128 NewDest) 3129 .addReg(TmpResultReg); 3130 ResultReg = NewDest; 3131 } 3132 if (!IsSALU) 3133 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 3134 .addReg(TmpResultReg, RegState::Kill); 3135 else 3136 ResultReg = TmpResultReg; 3137 // If there were truly no free SGPRs, we need to undo everything. 3138 if (!TmpScaledReg.isValid()) { 3139 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 3140 .addReg(ScaledReg, RegState::Kill) 3141 .addImm(-Offset); 3142 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 3143 .addReg(FrameReg) 3144 .addImm(ST.getWavefrontSizeLog2()); 3145 } 3146 } 3147 } 3148 3149 // Don't introduce an extra copy if we're just materializing in a mov. 3150 if (IsCopy) { 3151 MI->eraseFromParent(); 3152 return true; 3153 } 3154 FIOp->ChangeToRegister(ResultReg, false, false, true); 3155 return false; 3156 } 3157 3158 if (IsMUBUF) { 3159 // Disable offen so we don't need a 0 vgpr base. 3160 assert( 3161 static_cast<int>(FIOperandNum) == 3162 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); 3163 3164 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 3165 assert((SOffset.isImm() && SOffset.getImm() == 0)); 3166 3167 if (FrameReg != AMDGPU::NoRegister) 3168 SOffset.ChangeToRegister(FrameReg, false); 3169 3170 int64_t Offset = FrameInfo.getObjectOffset(Index); 3171 int64_t OldImm = 3172 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 3173 int64_t NewOffset = OldImm + Offset; 3174 3175 if (TII->isLegalMUBUFImmOffset(NewOffset) && 3176 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 3177 MI->eraseFromParent(); 3178 return true; 3179 } 3180 } 3181 3182 // If the offset is simply too big, don't convert to a scratch wave offset 3183 // relative index. 
3184 3185 FIOp->ChangeToImmediate(Offset); 3186 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) { 3187 Register TmpReg = 3188 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0); 3189 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 3190 .addImm(Offset); 3191 FIOp->ChangeToRegister(TmpReg, false, false, true); 3192 } 3193 3194 return false; 3195 } 3196 3197 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 3198 return AMDGPUInstPrinter::getRegisterName(Reg); 3199 } 3200 3201 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) { 3202 return getRegBitWidth(RC.getID()); 3203 } 3204 3205 static const TargetRegisterClass * 3206 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 3207 if (BitWidth == 64) 3208 return &AMDGPU::VReg_64RegClass; 3209 if (BitWidth == 96) 3210 return &AMDGPU::VReg_96RegClass; 3211 if (BitWidth == 128) 3212 return &AMDGPU::VReg_128RegClass; 3213 if (BitWidth == 160) 3214 return &AMDGPU::VReg_160RegClass; 3215 if (BitWidth == 192) 3216 return &AMDGPU::VReg_192RegClass; 3217 if (BitWidth == 224) 3218 return &AMDGPU::VReg_224RegClass; 3219 if (BitWidth == 256) 3220 return &AMDGPU::VReg_256RegClass; 3221 if (BitWidth == 288) 3222 return &AMDGPU::VReg_288RegClass; 3223 if (BitWidth == 320) 3224 return &AMDGPU::VReg_320RegClass; 3225 if (BitWidth == 352) 3226 return &AMDGPU::VReg_352RegClass; 3227 if (BitWidth == 384) 3228 return &AMDGPU::VReg_384RegClass; 3229 if (BitWidth == 512) 3230 return &AMDGPU::VReg_512RegClass; 3231 if (BitWidth == 1024) 3232 return &AMDGPU::VReg_1024RegClass; 3233 3234 return nullptr; 3235 } 3236 3237 static const TargetRegisterClass * 3238 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 3239 if (BitWidth == 64) 3240 return &AMDGPU::VReg_64_Align2RegClass; 3241 if (BitWidth == 96) 3242 return &AMDGPU::VReg_96_Align2RegClass; 3243 if (BitWidth == 128) 3244 return &AMDGPU::VReg_128_Align2RegClass; 3245 if (BitWidth == 160) 3246 return &AMDGPU::VReg_160_Align2RegClass; 3247 if (BitWidth == 192) 3248 return &AMDGPU::VReg_192_Align2RegClass; 3249 if (BitWidth == 224) 3250 return &AMDGPU::VReg_224_Align2RegClass; 3251 if (BitWidth == 256) 3252 return &AMDGPU::VReg_256_Align2RegClass; 3253 if (BitWidth == 288) 3254 return &AMDGPU::VReg_288_Align2RegClass; 3255 if (BitWidth == 320) 3256 return &AMDGPU::VReg_320_Align2RegClass; 3257 if (BitWidth == 352) 3258 return &AMDGPU::VReg_352_Align2RegClass; 3259 if (BitWidth == 384) 3260 return &AMDGPU::VReg_384_Align2RegClass; 3261 if (BitWidth == 512) 3262 return &AMDGPU::VReg_512_Align2RegClass; 3263 if (BitWidth == 1024) 3264 return &AMDGPU::VReg_1024_Align2RegClass; 3265 3266 return nullptr; 3267 } 3268 3269 const TargetRegisterClass * 3270 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { 3271 if (BitWidth == 1) 3272 return &AMDGPU::VReg_1RegClass; 3273 if (BitWidth == 16) 3274 return &AMDGPU::VGPR_16RegClass; 3275 if (BitWidth == 32) 3276 return &AMDGPU::VGPR_32RegClass; 3277 return ST.needsAlignedVGPRs() ? 
getAlignedVGPRClassForBitWidth(BitWidth) 3278 : getAnyVGPRClassForBitWidth(BitWidth); 3279 } 3280 3281 static const TargetRegisterClass * 3282 getAnyAGPRClassForBitWidth(unsigned BitWidth) { 3283 if (BitWidth == 64) 3284 return &AMDGPU::AReg_64RegClass; 3285 if (BitWidth == 96) 3286 return &AMDGPU::AReg_96RegClass; 3287 if (BitWidth == 128) 3288 return &AMDGPU::AReg_128RegClass; 3289 if (BitWidth == 160) 3290 return &AMDGPU::AReg_160RegClass; 3291 if (BitWidth == 192) 3292 return &AMDGPU::AReg_192RegClass; 3293 if (BitWidth == 224) 3294 return &AMDGPU::AReg_224RegClass; 3295 if (BitWidth == 256) 3296 return &AMDGPU::AReg_256RegClass; 3297 if (BitWidth == 288) 3298 return &AMDGPU::AReg_288RegClass; 3299 if (BitWidth == 320) 3300 return &AMDGPU::AReg_320RegClass; 3301 if (BitWidth == 352) 3302 return &AMDGPU::AReg_352RegClass; 3303 if (BitWidth == 384) 3304 return &AMDGPU::AReg_384RegClass; 3305 if (BitWidth == 512) 3306 return &AMDGPU::AReg_512RegClass; 3307 if (BitWidth == 1024) 3308 return &AMDGPU::AReg_1024RegClass; 3309 3310 return nullptr; 3311 } 3312 3313 static const TargetRegisterClass * 3314 getAlignedAGPRClassForBitWidth(unsigned BitWidth) { 3315 if (BitWidth == 64) 3316 return &AMDGPU::AReg_64_Align2RegClass; 3317 if (BitWidth == 96) 3318 return &AMDGPU::AReg_96_Align2RegClass; 3319 if (BitWidth == 128) 3320 return &AMDGPU::AReg_128_Align2RegClass; 3321 if (BitWidth == 160) 3322 return &AMDGPU::AReg_160_Align2RegClass; 3323 if (BitWidth == 192) 3324 return &AMDGPU::AReg_192_Align2RegClass; 3325 if (BitWidth == 224) 3326 return &AMDGPU::AReg_224_Align2RegClass; 3327 if (BitWidth == 256) 3328 return &AMDGPU::AReg_256_Align2RegClass; 3329 if (BitWidth == 288) 3330 return &AMDGPU::AReg_288_Align2RegClass; 3331 if (BitWidth == 320) 3332 return &AMDGPU::AReg_320_Align2RegClass; 3333 if (BitWidth == 352) 3334 return &AMDGPU::AReg_352_Align2RegClass; 3335 if (BitWidth == 384) 3336 return &AMDGPU::AReg_384_Align2RegClass; 3337 if (BitWidth == 512) 3338 return &AMDGPU::AReg_512_Align2RegClass; 3339 if (BitWidth == 1024) 3340 return &AMDGPU::AReg_1024_Align2RegClass; 3341 3342 return nullptr; 3343 } 3344 3345 const TargetRegisterClass * 3346 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { 3347 if (BitWidth == 16) 3348 return &AMDGPU::AGPR_LO16RegClass; 3349 if (BitWidth == 32) 3350 return &AMDGPU::AGPR_32RegClass; 3351 return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) 3352 : getAnyAGPRClassForBitWidth(BitWidth); 3353 } 3354 3355 static const TargetRegisterClass * 3356 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { 3357 if (BitWidth == 64) 3358 return &AMDGPU::AV_64RegClass; 3359 if (BitWidth == 96) 3360 return &AMDGPU::AV_96RegClass; 3361 if (BitWidth == 128) 3362 return &AMDGPU::AV_128RegClass; 3363 if (BitWidth == 160) 3364 return &AMDGPU::AV_160RegClass; 3365 if (BitWidth == 192) 3366 return &AMDGPU::AV_192RegClass; 3367 if (BitWidth == 224) 3368 return &AMDGPU::AV_224RegClass; 3369 if (BitWidth == 256) 3370 return &AMDGPU::AV_256RegClass; 3371 if (BitWidth == 288) 3372 return &AMDGPU::AV_288RegClass; 3373 if (BitWidth == 320) 3374 return &AMDGPU::AV_320RegClass; 3375 if (BitWidth == 352) 3376 return &AMDGPU::AV_352RegClass; 3377 if (BitWidth == 384) 3378 return &AMDGPU::AV_384RegClass; 3379 if (BitWidth == 512) 3380 return &AMDGPU::AV_512RegClass; 3381 if (BitWidth == 1024) 3382 return &AMDGPU::AV_1024RegClass; 3383 3384 return nullptr; 3385 } 3386 3387 static const TargetRegisterClass * 3388 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { 3389 if (BitWidth == 64) 3390 return &AMDGPU::AV_64_Align2RegClass; 3391 if (BitWidth == 96) 3392 return &AMDGPU::AV_96_Align2RegClass; 3393 if (BitWidth == 128) 3394 return &AMDGPU::AV_128_Align2RegClass; 3395 if (BitWidth == 160) 3396 return &AMDGPU::AV_160_Align2RegClass; 3397 if (BitWidth == 192) 3398 return &AMDGPU::AV_192_Align2RegClass; 3399 if (BitWidth == 224) 3400 return &AMDGPU::AV_224_Align2RegClass; 3401 if (BitWidth == 256) 3402 return &AMDGPU::AV_256_Align2RegClass; 3403 if (BitWidth == 288) 3404 return &AMDGPU::AV_288_Align2RegClass; 3405 if (BitWidth == 320) 3406 return &AMDGPU::AV_320_Align2RegClass; 3407 if (BitWidth == 352) 3408 return &AMDGPU::AV_352_Align2RegClass; 3409 if (BitWidth == 384) 3410 return &AMDGPU::AV_384_Align2RegClass; 3411 if (BitWidth == 512) 3412 return &AMDGPU::AV_512_Align2RegClass; 3413 if (BitWidth == 1024) 3414 return &AMDGPU::AV_1024_Align2RegClass; 3415 3416 return nullptr; 3417 } 3418 3419 const TargetRegisterClass * 3420 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { 3421 if (BitWidth == 32) 3422 return &AMDGPU::AV_32RegClass; 3423 return ST.needsAlignedVGPRs() 3424 ? 
getAlignedVectorSuperClassForBitWidth(BitWidth) 3425 : getAnyVectorSuperClassForBitWidth(BitWidth); 3426 } 3427 3428 const TargetRegisterClass * 3429 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 3430 if (BitWidth == 16) 3431 return &AMDGPU::SGPR_LO16RegClass; 3432 if (BitWidth == 32) 3433 return &AMDGPU::SReg_32RegClass; 3434 if (BitWidth == 64) 3435 return &AMDGPU::SReg_64RegClass; 3436 if (BitWidth == 96) 3437 return &AMDGPU::SGPR_96RegClass; 3438 if (BitWidth == 128) 3439 return &AMDGPU::SGPR_128RegClass; 3440 if (BitWidth == 160) 3441 return &AMDGPU::SGPR_160RegClass; 3442 if (BitWidth == 192) 3443 return &AMDGPU::SGPR_192RegClass; 3444 if (BitWidth == 224) 3445 return &AMDGPU::SGPR_224RegClass; 3446 if (BitWidth == 256) 3447 return &AMDGPU::SGPR_256RegClass; 3448 if (BitWidth == 288) 3449 return &AMDGPU::SGPR_288RegClass; 3450 if (BitWidth == 320) 3451 return &AMDGPU::SGPR_320RegClass; 3452 if (BitWidth == 352) 3453 return &AMDGPU::SGPR_352RegClass; 3454 if (BitWidth == 384) 3455 return &AMDGPU::SGPR_384RegClass; 3456 if (BitWidth == 512) 3457 return &AMDGPU::SGPR_512RegClass; 3458 if (BitWidth == 1024) 3459 return &AMDGPU::SGPR_1024RegClass; 3460 3461 return nullptr; 3462 } 3463 3464 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 3465 Register Reg) const { 3466 const TargetRegisterClass *RC; 3467 if (Reg.isVirtual()) 3468 RC = MRI.getRegClass(Reg); 3469 else 3470 RC = getPhysRegBaseClass(Reg); 3471 return RC ? isSGPRClass(RC) : false; 3472 } 3473 3474 const TargetRegisterClass * 3475 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 3476 unsigned Size = getRegSizeInBits(*SRC); 3477 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 3478 assert(VRC && "Invalid register class size"); 3479 return VRC; 3480 } 3481 3482 const TargetRegisterClass * 3483 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 3484 unsigned Size = getRegSizeInBits(*SRC); 3485 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 3486 assert(ARC && "Invalid register class size"); 3487 return ARC; 3488 } 3489 3490 const TargetRegisterClass * 3491 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 3492 unsigned Size = getRegSizeInBits(*VRC); 3493 if (Size == 32) 3494 return &AMDGPU::SGPR_32RegClass; 3495 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 3496 assert(SRC && "Invalid register class size"); 3497 return SRC; 3498 } 3499 3500 const TargetRegisterClass * 3501 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 3502 const TargetRegisterClass *SubRC, 3503 unsigned SubIdx) const { 3504 // Ensure this subregister index is aligned in the super register. 3505 const TargetRegisterClass *MatchRC = 3506 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 3507 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? 
MatchRC : nullptr; 3508 } 3509 3510 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 3511 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 3512 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 3513 return !ST.hasMFMAInlineLiteralBug(); 3514 3515 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 3516 OpType <= AMDGPU::OPERAND_SRC_LAST; 3517 } 3518 3519 bool SIRegisterInfo::shouldRewriteCopySrc( 3520 const TargetRegisterClass *DefRC, 3521 unsigned DefSubReg, 3522 const TargetRegisterClass *SrcRC, 3523 unsigned SrcSubReg) const { 3524 // We want to prefer the smallest register class possible, so we don't want to 3525 // stop and rewrite on anything that looks like a subregister 3526 // extract. Operations mostly don't care about the super register class, so we 3527 // only want to stop on the most basic of copies between the same register 3528 // class. 3529 // 3530 // e.g. if we have something like 3531 // %0 = ... 3532 // %1 = ... 3533 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 3534 // %3 = COPY %2, sub0 3535 // 3536 // We want to look through the COPY to find: 3537 // => %3 = COPY %0 3538 3539 // Plain copy. 3540 return getCommonSubClass(DefRC, SrcRC) != nullptr; 3541 } 3542 3543 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { 3544 // TODO: 64-bit operands have extending behavior from 32-bit literal. 3545 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && 3546 OpType <= AMDGPU::OPERAND_REG_IMM_LAST; 3547 } 3548 3549 /// Returns a lowest register that is not used at any point in the function. 3550 /// If all registers are used, then this function will return 3551 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return 3552 /// highest unused register. 3553 MCRegister SIRegisterInfo::findUnusedRegister( 3554 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, 3555 const MachineFunction &MF, bool ReserveHighestRegister) const { 3556 if (ReserveHighestRegister) { 3557 for (MCRegister Reg : reverse(*RC)) 3558 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 3559 return Reg; 3560 } else { 3561 for (MCRegister Reg : *RC) 3562 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 3563 return Reg; 3564 } 3565 return MCRegister(); 3566 } 3567 3568 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI, 3569 const RegisterBankInfo &RBI, 3570 Register Reg) const { 3571 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo()); 3572 if (!RB) 3573 return false; 3574 3575 return !RBI.isDivergentRegBank(RB); 3576 } 3577 3578 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 3579 unsigned EltSize) const { 3580 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC); 3581 assert(RegBitWidth >= 32 && RegBitWidth <= 1024); 3582 3583 const unsigned RegDWORDs = RegBitWidth / 32; 3584 const unsigned EltDWORDs = EltSize / 4; 3585 assert(RegSplitParts.size() + 1 >= EltDWORDs); 3586 3587 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1]; 3588 const unsigned NumParts = RegDWORDs / EltDWORDs; 3589 3590 return ArrayRef(Parts.data(), NumParts); 3591 } 3592 3593 const TargetRegisterClass* 3594 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 3595 Register Reg) const { 3596 return Reg.isVirtual() ? 
MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg); 3597 } 3598 3599 const TargetRegisterClass * 3600 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI, 3601 const MachineOperand &MO) const { 3602 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg()); 3603 return getSubRegisterClass(SrcRC, MO.getSubReg()); 3604 } 3605 3606 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 3607 Register Reg) const { 3608 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 3609 // Registers without classes are unaddressable, SGPR-like registers. 3610 return RC && isVGPRClass(RC); 3611 } 3612 3613 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 3614 Register Reg) const { 3615 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 3616 3617 // Registers without classes are unaddressable, SGPR-like registers. 3618 return RC && isAGPRClass(RC); 3619 } 3620 3621 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 3622 const TargetRegisterClass *SrcRC, 3623 unsigned SubReg, 3624 const TargetRegisterClass *DstRC, 3625 unsigned DstSubReg, 3626 const TargetRegisterClass *NewRC, 3627 LiveIntervals &LIS) const { 3628 unsigned SrcSize = getRegSizeInBits(*SrcRC); 3629 unsigned DstSize = getRegSizeInBits(*DstRC); 3630 unsigned NewSize = getRegSizeInBits(*NewRC); 3631 3632 // Do not increase size of registers beyond dword, we would need to allocate 3633 // adjacent registers and constraint regalloc more than needed. 3634 3635 // Always allow dword coalescing. 3636 if (SrcSize <= 32 || DstSize <= 32) 3637 return true; 3638 3639 return NewSize <= DstSize || NewSize <= SrcSize; 3640 } 3641 3642 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 3643 MachineFunction &MF) const { 3644 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; 3645 switch (RC->getID()) { 3646 default: 3647 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 3648 case AMDGPU::VGPR_32RegClassID: 3649 return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF)); 3650 case AMDGPU::SGPR_32RegClassID: 3651 case AMDGPU::SGPR_LO16RegClassID: 3652 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF)); 3653 } 3654 } 3655 3656 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 3657 unsigned Idx) const { 3658 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 3659 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 3660 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 3661 const_cast<MachineFunction &>(MF)); 3662 3663 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 3664 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 3665 const_cast<MachineFunction &>(MF)); 3666 3667 llvm_unreachable("Unexpected register pressure set!"); 3668 } 3669 3670 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 3671 static const int Empty[] = { -1 }; 3672 3673 if (RegPressureIgnoredUnits[RegUnit]) 3674 return Empty; 3675 3676 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 3677 } 3678 3679 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 3680 // Not a callee saved register. 3681 return AMDGPU::SGPR30_SGPR31; 3682 } 3683 3684 const TargetRegisterClass * 3685 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 3686 const RegisterBank &RB) const { 3687 switch (RB.getID()) { 3688 case AMDGPU::VGPRRegBankID: 3689 return getVGPRClassForBitWidth( 3690 std::max(ST.useRealTrue16Insts() ? 
16u : 32u, Size)); 3691 case AMDGPU::VCCRegBankID: 3692 assert(Size == 1); 3693 return getWaveMaskRegClass(); 3694 case AMDGPU::SGPRRegBankID: 3695 return getSGPRClassForBitWidth(std::max(32u, Size)); 3696 case AMDGPU::AGPRRegBankID: 3697 return getAGPRClassForBitWidth(std::max(32u, Size)); 3698 default: 3699 llvm_unreachable("unknown register bank"); 3700 } 3701 } 3702 3703 const TargetRegisterClass * 3704 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 3705 const MachineRegisterInfo &MRI) const { 3706 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 3707 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB)) 3708 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); 3709 3710 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB)) 3711 return getAllocatableClass(RC); 3712 3713 return nullptr; 3714 } 3715 3716 MCRegister SIRegisterInfo::getVCC() const { 3717 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 3718 } 3719 3720 MCRegister SIRegisterInfo::getExec() const { 3721 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 3722 } 3723 3724 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 3725 // VGPR tuples have an alignment requirement on gfx90a variants. 3726 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass 3727 : &AMDGPU::VReg_64RegClass; 3728 } 3729 3730 const TargetRegisterClass * 3731 SIRegisterInfo::getRegClass(unsigned RCID) const { 3732 switch ((int)RCID) { 3733 case AMDGPU::SReg_1RegClassID: 3734 return getBoolRC(); 3735 case AMDGPU::SReg_1_XEXECRegClassID: 3736 return getWaveMaskRegClass(); 3737 case -1: 3738 return nullptr; 3739 default: 3740 return AMDGPUGenRegisterInfo::getRegClass(RCID); 3741 } 3742 } 3743 3744 // Find reaching register definition 3745 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 3746 MachineInstr &Use, 3747 MachineRegisterInfo &MRI, 3748 LiveIntervals *LIS) const { 3749 auto &MDT = LIS->getDomTree(); 3750 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 3751 SlotIndex DefIdx; 3752 3753 if (Reg.isVirtual()) { 3754 if (!LIS->hasInterval(Reg)) 3755 return nullptr; 3756 LiveInterval &LI = LIS->getInterval(Reg); 3757 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 3758 : MRI.getMaxLaneMaskForVReg(Reg); 3759 VNInfo *V = nullptr; 3760 if (LI.hasSubRanges()) { 3761 for (auto &S : LI.subranges()) { 3762 if ((S.LaneMask & SubLanes) == SubLanes) { 3763 V = S.getVNInfoAt(UseIdx); 3764 break; 3765 } 3766 } 3767 } else { 3768 V = LI.getVNInfoAt(UseIdx); 3769 } 3770 if (!V) 3771 return nullptr; 3772 DefIdx = V->def; 3773 } else { 3774 // Find last def. 
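// A physical register has no single live interval, so walk each register
// unit instead: take the value live at UseIdx in every unit and keep the
// dominance-wise latest of those defs. If any unit has no live value at
// UseIdx, the register is not fully defined there, so bail out.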
3775 for (MCRegUnit Unit : regunits(Reg.asMCReg())) { 3776 LiveRange &LR = LIS->getRegUnit(Unit); 3777 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 3778 if (!DefIdx.isValid() || 3779 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 3780 LIS->getInstructionFromIndex(V->def))) 3781 DefIdx = V->def; 3782 } else { 3783 return nullptr; 3784 } 3785 } 3786 } 3787 3788 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 3789 3790 if (!Def || !MDT.dominates(Def, &Use)) 3791 return nullptr; 3792 3793 assert(Def->modifiesRegister(Reg, this)); 3794 3795 return Def; 3796 } 3797 3798 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 3799 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32); 3800 3801 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 3802 AMDGPU::SReg_32RegClass, 3803 AMDGPU::AGPR_32RegClass } ) { 3804 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 3805 return Super; 3806 } 3807 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 3808 &AMDGPU::VGPR_32RegClass)) { 3809 return Super; 3810 } 3811 3812 return AMDGPU::NoRegister; 3813 } 3814 3815 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 3816 if (!ST.needsAlignedVGPRs()) 3817 return true; 3818 3819 if (isVGPRClass(&RC)) 3820 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 3821 if (isAGPRClass(&RC)) 3822 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 3823 if (isVectorSuperClass(&RC)) 3824 return RC.hasSuperClassEq( 3825 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 3826 3827 return true; 3828 } 3829 3830 const TargetRegisterClass * 3831 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { 3832 if (!RC || !ST.needsAlignedVGPRs()) 3833 return RC; 3834 3835 unsigned Size = getRegSizeInBits(*RC); 3836 if (Size <= 32) 3837 return RC; 3838 3839 if (isVGPRClass(RC)) 3840 return getAlignedVGPRClassForBitWidth(Size); 3841 if (isAGPRClass(RC)) 3842 return getAlignedAGPRClassForBitWidth(Size); 3843 if (isVectorSuperClass(RC)) 3844 return getAlignedVectorSuperClassForBitWidth(Size); 3845 3846 return RC; 3847 } 3848 3849 ArrayRef<MCPhysReg> 3850 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 3851 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); 3852 } 3853 3854 ArrayRef<MCPhysReg> 3855 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 3856 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2); 3857 } 3858 3859 ArrayRef<MCPhysReg> 3860 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 3861 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 3862 } 3863 3864 unsigned 3865 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, 3866 unsigned SubReg) const { 3867 switch (RC->TSFlags & SIRCFlags::RegKindMask) { 3868 case SIRCFlags::HasSGPR: 3869 return std::min(128u, getSubRegIdxSize(SubReg)); 3870 case SIRCFlags::HasAGPR: 3871 case SIRCFlags::HasVGPR: 3872 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: 3873 return std::min(32u, getSubRegIdxSize(SubReg)); 3874 default: 3875 break; 3876 } 3877 return 0; 3878 } 3879 3880 unsigned 3881 SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, 3882 const TargetRegisterClass &RC) const { 3883 for (MCPhysReg Reg : reverse(RC.getRegisters())) 3884 if (MRI.isPhysRegUsed(Reg)) 3885 return getHWRegIndex(Reg) + 1; 3886 return 0; 3887 } 3888 3889 SmallVector<StringLiteral> 3890 
SIRegisterInfo::getVRegFlagsOfReg(Register Reg, 3891 const MachineFunction &MF) const { 3892 SmallVector<StringLiteral> RegFlags; 3893 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 3894 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) 3895 RegFlags.push_back("WWM_REG"); 3896 return RegFlags; 3897 } 3898
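// Note (illustrative only): the flag names returned here are consumed when
// virtual registers are serialized, so a WWM spill register may show up in
// printed MIR roughly as
//   - { id: 0, class: vgpr_32, flags: [ WWM_REG ] }
// The exact YAML fields depend on the MIR serialization format in use.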