//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C) :
    GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
    HasExcessPressure(false), MF(nullptr) { }

void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  // FIXME: This is also necessary, because some passes that run after
  // scheduling and before regalloc increase register pressure.
  const unsigned ErrorMargin = 3;

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  TargetOccupancy = MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
  VGPRCriticalLimit =
      std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);

  // Subtract error margin from register limits and avoid overflow.
  SGPRCriticalLimit =
      std::min(SGPRCriticalLimit - ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit =
      std::min(VGPRCriticalLimit - ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit = std::min(SGPRExcessLimit - ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
}

void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop, const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure) {

  Cand.SU = SU;
  Cand.AtTop = AtTop;

  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);

  Pressure.clear();
  MaxPressure.clear();

  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else {
    // FIXME: I think for bottom-up scheduling, the register pressure is cached
    // and can be retrieved by DAG->getPressureDiff(SU).
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasExcessPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasExcessPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // have the same cost, so we don't need to prefer one over the other.
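  // The deltas below become non-negative once the projected pressure crosses
  // the critical limit; the larger of the two decides which register set is
  // reported in CriticalMax.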

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasExcessPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
                  SGPRPressure, VGPRPressure);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) {
    for (SDep &Dep : SU->Preds) {
      if (Dep.isCluster()) {
        HasClusteredNodes = true;
        break;
      }
    }
  }

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
                        std::unique_ptr<MachineSchedStrategy> S) :
    ScheduleDAGMILive(C, std::move(S)),
    ST(MF.getSubtarget<GCNSubtarget>()),
    MFI(*MF.getInfo<SIMachineFunctionInfo>()),
    StartingOccupancy(MFI.getOccupancy()),
    MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {

  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}

void GCNScheduleDAGMILive::schedule() {
  if (Stage == Collect) {
    // Just record regions on the first pass.
    Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
    return;
  }

  std::vector<MachineInstr*> Unsched;
  Unsched.reserve(NumRegionInstrs);
  for (auto &I : *this) {
    Unsched.push_back(&I);
  }

  GCNRegPressure PressureBefore;
  if (LIS) {
    PressureBefore = Pressure[RegionIdx];

    LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
               GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
               dbgs() << "Region live-in pressure: ";
               llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
               dbgs() << "Region register pressure: ";
               PressureBefore.print(dbgs()));
  }

  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
  // Set HasClusteredNodes to true for late stages where we have already
  // collected it. That way pickNode() will not scan SDep's when not needed.
  S.HasClusteredNodes = Stage > InitialSchedule;
  S.HasExcessPressure = false;
  ScheduleDAGMILive::schedule();
  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
  RescheduleRegions[RegionIdx] = false;
  if (Stage == InitialSchedule && S.HasClusteredNodes)
    RegionsWithClusters[RegionIdx] = true;
  if (S.HasExcessPressure)
    RegionsWithHighRP[RegionIdx] = true;

  if (!LIS)
    return;

  // Check the results of scheduling.
  auto PressureAfter = getRealRegPressure();

  LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
             PressureAfter.print(dbgs()));

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    Pressure[RegionIdx] = PressureAfter;
    RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == MinOccupancy;

    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  unsigned WavesAfter =
      std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the
  // just-scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits, which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
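  // MFI.getMinAllowedOccupancy() is the floor a memory bound (or wave-limited)
  // function may drop to; occupancy below that floor is never accepted here.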
  if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < MinOccupancy) {
    MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(MinOccupancy);
    RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << MinOccupancy << ".\n");
  }

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
      PressureAfter.getAGPRNum() > MaxVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    RescheduleRegions[RegionIdx] = true;
    RegionsWithHighRP[RegionIdx] = true;
  }

  // If this condition is true, then either the occupancy before and after
  // scheduling is the same, or we are allowing the occupancy to drop because
  // the function is memory bound. Even if we are OK with the current occupancy,
  // we still need to verify that we will not introduce any extra chance of
  // spilling.
  if (WavesAfter >= MinOccupancy) {
    if (Stage == UnclusteredReschedule &&
        !PressureAfter.less(ST, PressureBefore)) {
      LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    } else if (WavesAfter > MFI.getMinWavesPerEU() ||
               PressureAfter.less(ST, PressureBefore) ||
               !RescheduleRegions[RegionIdx]) {
      Pressure[RegionIdx] = PressureAfter;
      RegionsWithMinOcc[RegionIdx] =
          PressureAfter.getOccupancy(ST) == MinOccupancy;
      if (!RegionsWithClusters[RegionIdx] &&
          (Stage + 1) == UnclusteredReschedule)
        RescheduleRegions[RegionIdx] = false;
      return;
    } else {
      LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    }
  }

  RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
                                 (Stage + 1) != UnclusteredReschedule;
  RegionEnd = RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != RegionEnd) {
      BB->remove(MI);
      BB->insert(RegionEnd, MI);
      if (!MI->isDebugInstr())
        LIS->handleMove(*MI, true);
    }
    // Reset read-undef flags and update them later.
    for (auto &Op : MI->operands())
      if (Op.isReg() && Op.isDef())
        Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *LIS);
      }
    }
    RegionEnd = MI->getIterator();
    ++RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past debug instrs to the actual end of the scheduling region.
  while (SkippedDebugInstr-- > 0)
    ++RegionEnd;

  // If Unsched.front() instruction is a debug instruction, this will actually
  // shrink the region since we moved all debug instructions to the end of the
  // block. Find the first instruction that is not a debug instruction.
  RegionBegin = Unsched.front()->getIterator();
  if (RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  placeDebugValues();

  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
}

GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}

void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor, then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be sent to scheduling past the current block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
    SlotIndexes *Ind = LIS->getSlotIndexes();
    if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
      OnlySucc = *MBB->succ_begin();
  }

  // Scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for ( ; ; ) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> BBStarters;
  BBStarters.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
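    // One starter instruction is recorded per basic block; getLiveRegMap()
    // below computes the live register set immediately before each of them.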
    BBStarters.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
}

void GCNScheduleDAGMILive::finalizeSchedule() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithClusters.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RescheduleRegions.set();
  RegionsWithClusters.reset();
  RegionsWithHighRP.reset();
  RegionsWithMinOcc.reset();

  if (!Regions.empty())
    BBLiveInMap = getBBLiveInMap();

  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;

  do {
    Stage++;
    RegionIdx = 0;
    MachineBasicBlock *MBB = nullptr;

    if (Stage > InitialSchedule) {
      if (!LIS)
        break;

      // Retry function scheduling if the resulting occupancy is lower than the
      // one used for the first scheduling pass. This gives more freedom when
      // scheduling low register pressure blocks.
      // Code is partially copied from MachineSchedulerBase::scheduleRegions().

      if (Stage == UnclusteredReschedule) {
        if (RescheduleRegions.none())
          continue;
        LLVM_DEBUG(dbgs() <<
                   "Retrying function scheduling without clustering.\n");
      }

      if (Stage == ClusteredLowOccupancyReschedule) {
        if (StartingOccupancy <= MinOccupancy)
          break;

        LLVM_DEBUG(
            dbgs()
            << "Retrying function scheduling with lowest recorded occupancy "
            << MinOccupancy << ".\n");
      }

      if (Stage == PreRARematerialize) {
        if (RegionsWithMinOcc.none() || Regions.size() == 1)
          break;

        const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
        const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
        // Check maximum occupancy.
        if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
            MinOccupancy)
          break;

        // FIXME: This pass will invalidate cached MBBLiveIns for regions in
        // between the defs and the region the def is sunk to. Cached pressure
        // for regions a def is sunk from will also be invalidated. This will
        // need to be fixed if there is another pass after this one.
        static_assert(LastStage == PreRARematerialize,
                      "Passes after PreRARematerialize are not supported");

        collectRematerializableInstructions();
        if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
          break;

        LLVM_DEBUG(
            dbgs() << "Retrying function scheduling with improved occupancy of "
                   << MinOccupancy << " from rematerializing\n");
      }
    }

    if (Stage == UnclusteredReschedule)
      SavedMutations.swap(Mutations);

    for (auto Region : Regions) {
      if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
           !RescheduleRegions[RegionIdx]) ||
          (Stage == ClusteredLowOccupancyReschedule &&
           !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {

        ++RegionIdx;
        continue;
      }

      RegionBegin = Region.first;
      RegionEnd = Region.second;

      if (RegionBegin->getParent() != MBB) {
        if (MBB) finishBlock();
        MBB = RegionBegin->getParent();
        startBlock(MBB);
        if (Stage == InitialSchedule)
          computeBlockPressure(MBB);
      }

      unsigned NumRegionInstrs = std::distance(begin(), end());
      enterRegion(MBB, begin(), end(), NumRegionInstrs);

      // Skip empty scheduling regions (0 or 1 schedulable instructions).
      if (begin() == end() || begin() == std::prev(end())) {
        exitRegion();
        ++RegionIdx;
        continue;
      }

      LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
      LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
                        << MBB->getName() << "\n From: " << *begin()
                        << " To: ";
                 if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
                 else dbgs() << "End";
                 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

      schedule();

      exitRegion();
      ++RegionIdx;
    }
    finishBlock();

    if (Stage == UnclusteredReschedule)
      SavedMutations.swap(Mutations);
  } while (Stage != LastStage);
}

void GCNScheduleDAGMILive::collectRematerializableInstructions() {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization
    if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
        !MRI.hasOneNonDBGUse(Reg))
      continue;

    MachineOperand *Op = MRI.getOneDef(Reg);
    MachineInstr *Def = Op->getParent();
    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def, AA))
      continue;

    MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    // We are only collecting defs that are defined in another block and are
    // live-through or used inside regions at MinOccupancy. This means that the
    // register must be in the live-in set for the region.
    bool AddedToRematList = false;
    for (unsigned I = 0, E = Regions.size(); I != E; ++I) {
      auto It = LiveIns[I].find(Reg);
      if (It != LiveIns[I].end() && !It->second.none()) {
        if (RegionsWithMinOcc[I]) {
          RematerializableInsts[I][Def] = UseI;
          AddedToRematList = true;
        }

        // Collect regions with rematerializable reg as live-in to avoid
        // searching later when updating RP.
        RematDefToLiveInRegions[Def].push_back(I);
      }
    }
    if (!AddedToRematList)
      RematDefToLiveInRegions.erase(Def);
  }
}

bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                                   const TargetInstrInfo *TII) {
  // Temporary copies of cached variables we will be modifying and replacing if
  // sinking succeeds.
  SmallVector<
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
      NewRegions;
  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
  DenseMap<unsigned, GCNRegPressure> NewPressure;
  BitVector NewRescheduleRegions;

  NewRegions.resize(Regions.size());
  NewRescheduleRegions.resize(Regions.size());

  // Collect only regions that have a rematerializable def as a live-in.
  SmallSet<unsigned, 16> ImpactedRegions;
  for (const auto &It : RematDefToLiveInRegions)
    ImpactedRegions.insert(It.second.begin(), It.second.end());

  // Make copies of register pressure and live-ins cache that will be updated
  // as we rematerialize.
  for (auto Idx : ImpactedRegions) {
    NewPressure[Idx] = Pressure[Idx];
    NewLiveIns[Idx] = LiveIns[Idx];
  }
  NewRegions = Regions;
  NewRescheduleRegions.reset();

  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
  bool Improved = false;
  for (auto I : ImpactedRegions) {
    if (!RegionsWithMinOcc[I])
      continue;

    Improved = false;
    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
    int SGPRUsage = NewPressure[I].getSGPRNum();

    // TODO: Handle occupancy drop due to AGPR and SGPR.
    // Check that the cause of the occupancy drop is VGPR usage and not SGPR.
    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
      break;

    // The occupancy of this region could have been improved by a previous
    // iteration's sinking of defs.
    if (NewPressure[I].getOccupancy(ST) > MinOccupancy) {
      NewRescheduleRegions[I] = true;
      Improved = true;
      continue;
    }

    // First check if we have enough trivially rematerializable instructions to
    // improve occupancy. Optimistically assume all instructions we are able to
    // sink will decrease RP.
    int TotalSinkableRegs = 0;
    for (const auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      Register DefReg = Def->getOperand(0).getReg();
      TotalSinkableRegs +=
          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
    }
    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
    // If in the most optimistic scenario we cannot improve occupancy, then do
    // not attempt to sink any instructions.
    if (OptimisticOccupancy <= MinOccupancy)
      break;

    unsigned ImproveOccupancy = 0;
    SmallVector<MachineInstr *, 4> SinkedDefs;
    for (auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      MachineBasicBlock::iterator InsertPos =
          MachineBasicBlock::iterator(It.second);
      Register Reg = Def->getOperand(0).getReg();
      // Rematerialize MI to its use block. Since we are only rematerializing
      // instructions that do not have any virtual reg uses, we do not need to
      // call LiveRangeEdit::allUsesAvailableAt() and
      // LiveRangeEdit::canRematerializeAt().
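      // reMaterialize() emits the clone immediately before InsertPos (the
      // single non-debug user), which is why the new instruction is recovered
      // with --InsertPos below.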
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *TRI);
      MachineInstr *NewMI = &*(--InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
      InsertedMIToOldDef[NewMI] = Def;

      // Update region boundaries in the scheduling region we sank from, since
      // we may sink an instruction that was at the beginning or end of its
      // region.
      updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                             /*Removing =*/true);

      // Update region boundaries in the region we sank to.
      updateRegionBoundaries(NewRegions, InsertPos, NewMI);

      LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
      for (auto Idx : RematDefToLiveInRegions[Def]) {
        NewLiveIns[Idx].erase(Reg);
        if (InsertPos->getParent() != Regions[Idx].first->getParent()) {
          // Def is live-through and not used in this block.
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI);
        } else {
          // Def is used and rematerialized into this block.
          GCNDownwardRPTracker RPT(*LIS);
          auto *NonDbgMI = &*skipDebugInstructionsForward(
              NewRegions[Idx].first, NewRegions[Idx].second);
          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
          RPT.advance(NewRegions[Idx].second);
          NewPressure[Idx] = RPT.moveMaxPressure();
        }
      }

      SinkedDefs.push_back(Def);
      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
      if (ImproveOccupancy > MinOccupancy)
        break;
    }

    // Remove the defs we just sank from all regions' lists of sinkable defs.
    for (auto &Def : SinkedDefs)
      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
        RematerializableInsts[TrackedIdx].erase(Def);

    if (ImproveOccupancy <= MinOccupancy)
      break;

    NewRescheduleRegions[I] = true;
    Improved = true;
  }

  if (!Improved) {
    // Occupancy was not improved for all regions that were at MinOccupancy.
    // Undo sinking and remove newly rematerialized instructions.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy was improved for all regions.
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  // Update live-ins, register pressure, and regions caches.
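  // The cached MBB live-ins for the affected blocks are dropped as well so
  // they get recomputed if needed; see also the FIXME in finalizeSchedule().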
  for (auto Idx : ImpactedRegions) {
    LiveIns[Idx] = NewLiveIns[Idx];
    Pressure[Idx] = NewPressure[Idx];
    MBBLiveIns.erase(Regions[Idx].first->getParent());
  }
  Regions = NewRegions;
  RescheduleRegions = NewRescheduleRegions;

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++MinOccupancy);

  return true;
}

// Copied from MachineLICM
bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI,
                                                       AAResults *AA) {
  if (!TII->isTriviallyReMaterializable(MI, AA))
    return false;

  for (const MachineOperand &MO : MI.operands())
    if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual())
      return false;

  return true;
}

// When removing, we will have to check both the beginning and the end of the
// region. When inserting, we only have to check whether we are inserting NewMI
// in front of a scheduling region; we never need to check the end, since we
// only ever insert before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
                              MachineBasicBlock::iterator>> &RegionBoundaries,
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = RegionBoundaries.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != RegionBoundaries[I].first->getParent())
      return;

    if (Removing && MI == RegionBoundaries[I].first &&
        MI == RegionBoundaries[I].second) {
      // MI is in a region of size 1. After removing, the region will have
      // size 0, so set RegionBegin and RegionEnd to the end-of-block iterator.
      RegionBoundaries[I] =
          std::make_pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == RegionBoundaries[I].first) {
      if (Removing)
        RegionBoundaries[I] =
            std::make_pair(std::next(MI), RegionBoundaries[I].second);
      else
        // Inserted NewMI in front of the region; set the new RegionBegin to
        // NewMI.
        RegionBoundaries[I] = std::make_pair(MachineBasicBlock::iterator(NewMI),
                                             RegionBoundaries[I].second);
      return;
    }
    if (Removing && MI == RegionBoundaries[I].second) {
      RegionBoundaries[I] =
          std::make_pair(RegionBoundaries[I].first, std::prev(MI));
      return;
    }
  }
}