//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
///
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

static cl::opt<bool> DisableUnclusterHighRP(
    "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
    cl::desc("Disable unclustered high register pressure "
             "reduction scheduling stage."),
    cl::init(false));

static cl::opt<bool> DisableClusteredLowOccupancy(
    "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
    cl::desc("Disable clustered low occupancy "
             "rescheduling for ILP scheduling stage."),
    cl::init(false));

static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(10));

static cl::opt<bool>
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(false));

static cl::opt<bool> GCNTrackers(
    "amdgpu-use-amdgpu-trackers", cl::Hidden,
    cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
    cl::init(false));

const unsigned ScheduleMetrics::ScaleFactor = 100;

GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
}

void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if the kernel is wave limited or memory
  // bound, and using the relaxed occupancy feature.
  TargetOccupancy =
      RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);

  if (!KnownExcessRP) {
    VGPRCriticalLimit =
        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
  } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result except
    // it returns a reasonably small number for targets with lots of VGPRs,
    // such as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");

    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
  }

  // Subtract error margin and bias from register limits and avoid overflow.
  SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);

  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}

/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
/// current register pressure.
///
/// This works for the common case, but it has a few exceptions that have been
/// observed through trial and error:
///   - Explicit physical register operands
///   - Subregister definitions
///
/// In both of those cases, PressureDiff doesn't represent the actual pressure,
/// and querying LiveIntervals through the RegPressureTracker is needed to get
/// an accurate value.
///
/// We should eventually only use PressureDiff for maximum performance, but
/// this already allows 80% of SUs to take the fast path without changing
/// scheduling at all. Further changes would either change scheduling, or
/// require a lot more logic to recover an accurate pressure estimate from the
/// PressureDiffs.
static bool canUsePressureDiffs(const SUnit &SU) {
  if (!SU.isInstr())
    return false;

  // Cannot use pressure diffs for subregister defs or with physregs, as it's
  // imprecise in both cases.
  for (const auto &Op : SU.getInstr()->operands()) {
    if (!Op.isReg() || Op.isImplicit())
      continue;
    if (Op.getReg().isPhysical() ||
        (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
      return false;
  }
  return true;
}

static void getRegisterPressures(
    bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
    std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
    GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
    ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
  if (!GCNTrackers) {
    AtTop
        ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
        : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);

    return;
  }

  // GCNTrackers
  Pressure.resize(4, 0);
  MachineInstr *MI = SU->getInstr();
  GCNRegPressure NewPressure;
  if (AtTop) {
    GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
    NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
  } else {
    GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
    TempUpwardTracker.recede(*MI);
    NewPressure = TempUpwardTracker.getPressure();
  }
  Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
  Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
      NewPressure.getArchVGPRNum();
  Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}

// Return true if the instruction is mutually exclusive with all non-IGLP DAG
// mutations, requiring all other mutations to be disabled.
static bool isIGLPMutationOnly(unsigned Opcode) {
  return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT;
}

void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure, bool IsBottomUp) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  Pressure.clear();
  MaxPressure.clear();

  // We try to use the cached PressureDiffs in the ScheduleDAG whenever
  // possible over querying the RegPressureTracker.
  //
  // RegPressureTracker will make a lot of LIS queries which are very
  // expensive; it is considered a slow function in this context.
  //
  // PressureDiffs are precomputed and cached, and getPressureDiff is just a
  // trivial lookup into an array. It is pretty much free.
  //
  // In EXPENSIVE_CHECKS, we always query the RPTracker to verify the results
  // of PressureDiffs.
  if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
                         DownwardTracker, UpwardTracker, DAG, SRI);
  } else {
    // Reserve 4 slots.
    Pressure.resize(4, 0);
    Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;

    for (const auto &Diff : DAG->getPressureDiff(SU)) {
      if (!Diff.isValid())
        continue;
      // PressureDiffs is always bottom-up, so if we're working top-down we
      // need to invert its sign.
      Pressure[Diff.getPSet()] +=
          (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
    }

#ifdef EXPENSIVE_CHECKS
    std::vector<unsigned> CheckPressure, CheckMaxPressure;
    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
                         DownwardTracker, UpwardTracker, DAG, SRI);
    if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
        Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
      errs() << "Register Pressure is inaccurate when calculated through "
                "PressureDiff\n"
             << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
             << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
      report_fatal_error("inaccurate register pressure calculation");
    }
#endif
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure has
  // the same cost, so we don't need to prefer one over the other.
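  // A positive delta below means the candidate would push that register set
  // past its critical limit; the set with the larger delta is the one recorded
  // in CriticalMax.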

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand,
                                         bool IsBottomUp) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  if (DAG->isTrackingPressure()) {
    if (!GCNTrackers) {
      SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
      VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    } else {
      GCNRPTracker *T = IsBottomUp
                            ? static_cast<GCNRPTracker *>(&UpwardTracker)
                            : static_cast<GCNRPTracker *>(&DownwardTracker);
      SGPRPressure = T->getPressure().getSGPRNum();
      VGPRPressure = T->getPressure().getArchVGPRNum();
    }
  }
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure, IsBottomUp);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
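  // The candidates from the previous pick are cached; they only need to be
  // recomputed when they were scheduled, became invalid, or the zone policy
  // changed.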
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
                      /*IsBottomUp=*/true);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
                        /*IsBottomUp=*/true);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
                      /*IsBottomUp=*/false);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
                        /*IsBottomUp=*/false);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
                          /*IsBottomUp=*/false);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
                          /*IsBottomUp=*/true);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (GCNTrackers) {
    MachineInstr *MI = SU->getInstr();
    IsTopNode ? (void)DownwardTracker.advance(MI, false)
              : UpwardTracker.recede(*MI);
  }

  return GenericScheduler::schedNode(SU, IsTopNode);
}

GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
  assert(CurrentStage && CurrentStage != SchedStages.end());
  return *CurrentStage;
}

bool GCNSchedStrategy::advanceStage() {
  assert(CurrentStage != SchedStages.end());
  if (!CurrentStage)
    CurrentStage = SchedStages.begin();
  else
    CurrentStage++;

  return CurrentStage != SchedStages.end();
}

bool GCNSchedStrategy::hasNextStage() const {
  assert(CurrentStage);
  return std::next(CurrentStage) != SchedStages.end();
}

GCNSchedStageID GCNSchedStrategy::getNextStage() const {
  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
  return *std::next(CurrentStage);
}

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C, bool IsLegacyScheduler)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
  GCNTrackers = GCNTrackers & !IsLegacyScheduler;
}

GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
}

bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Avoid spilling by exceeding the register limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Bias physical register defs and copies towards their uses and definitions,
  // respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;

    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
      return TryCand.Reason != NoCand;

    // Unconditionally try to reduce latency.
    if (tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  const SUnit *CandNextClusterSU =
      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  const SUnit *TryCandNextClusterSU =
      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}

GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule);
}

/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions as much as possible. This is achieved by:
///   1. Prioritizing clustered operations before the stall latency heuristic.
///   2. Prioritizing long-latency loads before the stall latency heuristic.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending, or nullptr
///             if Cand is from a different zone than TryCand.
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                                   SchedCandidate &TryCand,
                                                   SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Bias physical register defs and copies towards their uses and definitions,
  // respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  if (DAG->isTrackingPressure()) {
    // Avoid exceeding the target's limit.
    if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                    RegExcess, TRI, DAG->MF))
      return TryCand.Reason != NoCand;

    // Avoid increasing the max critical pressure in the scheduled region.
    if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                    TryCand, Cand, RegCritical, TRI, DAG->MF))
      return TryCand.Reason != NoCand;
  }

  // MaxMemoryClause-specific: We prioritize clustered instructions as we would
  // get more benefit from clausing these memory instructions.
  const SUnit *CandNextClusterSU =
      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  const SUnit *TryCandNextClusterSU =
      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
    return TryCand.Reason != NoCand;

  // We only compare a subset of features when comparing nodes between
  // Top and Bottom boundary. Some properties are simply incomparable; in many
  // other instances we should only override the other boundary if something
  // is a clear good pick on one boundary. Skip heuristics that are more
  // "tie-breaking" in nature.
  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // For loops that are acyclic path limited, aggressively schedule for
    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
    // heuristics to take precedence.
    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
        tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // MaxMemoryClause-specific: Prioritize long latency memory load
    // instructions in top-bottom order to hide more latency. The mayLoad check
    // is used to exclude store-like instructions, which we do not want to
    // schedule too early.
    bool TryMayLoad =
        TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
    bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();

    if (TryMayLoad || CandMayLoad) {
      bool TryLongLatency =
          TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
      bool CandLongLatency =
          10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;

      if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
                     Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
                     Cand, Stall))
        return TryCand.Reason != NoCand;
    }
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;
  }

  if (SameBoundary) {
    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
      return TryCand.Reason != NoCand;

    // Avoid serializing long latency dependence chains.
    // For acyclic path limited loops, latency was already checked above.
    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Fall through to original instruction order.
    if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
      assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
      TryCand.Reason = NodeOrder;
      return true;
    }
  }

  return false;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
      RegionLiveOuts(this, /*IsLiveOut=*/true) {

  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  if (RelaxedOcc) {
    MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
    if (MinOccupancy != StartingOccupancy)
      LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
                        << ".\n");
  }
}

std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  switch (SchedStageID) {
  case GCNSchedStageID::OccInitialSchedule:
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
  case GCNSchedStageID::PreRARematerialize:
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
  case GCNSchedStageID::ILPInitialSchedule:
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
  case GCNSchedStageID::MemoryClauseInitialSchedule:
    return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
                                                              *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}

void GCNScheduleDAGMILive::schedule() {
  // Collect all scheduling regions. The actual scheduling is performed in
  // GCNScheduleDAGMILive::finalizeSchedule.
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
}

GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}

static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
                                        MachineBasicBlock::iterator RegionEnd) {
  auto REnd = RegionEnd == RegionBegin->getParent()->end()
                  ? std::prev(RegionEnd)
                  : RegionEnd;
  return &*skipDebugInstructionsBackward(REnd, RegionBegin);
}

void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor, then the live-ins of that successor
  // are live-outs of the current block. We can reuse the calculated live set
  // if the successor will be scheduled after the current block.
  //
  // However, due to a bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
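  // The reuse below is only attempted when the successor is non-empty, has
  // this block as its single predecessor, and starts after this block in
  // slot-index order.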
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1) {
    auto *Candidate = *MBB->succ_begin();
    if (!Candidate->empty() && Candidate->pred_size() == 1) {
      SlotIndexes *Ind = LIS->getSlotIndexes();
      if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
        OnlySucc = Candidate;
    }
  }

  // Scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getRegionLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> RegionFirstMIs;
  RegionFirstMIs.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
    RegionFirstMIs.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getRegionLiveOutMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> RegionLastMIs;
  RegionLastMIs.reserve(Regions.size());
  for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
    RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));

  return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
}

void RegionPressureMap::buildLiveRegMap() {
  IdxToInstruction.clear();

  RegionLiveRegMap =
      IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
  for (unsigned I = 0; I < DAG->Regions.size(); I++) {
    MachineInstr *RegionKey =
        IsLiveOut
            ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
            : &*DAG->Regions[I].first;
    IdxToInstruction[I] = RegionKey;
  }
}

void GCNScheduleDAGMILive::finalizeSchedule() {
  // Start actual scheduling here. This function is called by the base
  // MachineScheduler after all regions have been recorded by
  // GCNScheduleDAGMILive::schedule().
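  // All of the per-region vectors below are indexed by region number.
  // RescheduleRegions starts fully set so that every region is a candidate
  // for the later stages.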
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
  RescheduleRegions.set();
  RegionsWithHighRP.reset();
  RegionsWithExcessRP.reset();
  RegionsWithMinOcc.reset();
  RegionsWithIGLPInstrs.reset();

  runSchedStages();
}

void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  if (!Regions.empty()) {
    BBLiveInMap = getRegionLiveInMap();
    if (GCNTrackers)
      RegionLiveOuts.buildLiveRegMap();
  }

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  while (S.advanceStage()) {
    auto Stage = createSchedStage(S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Setup for scheduling the region and check whether it should be skipped.
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      if (GCNTrackers) {
        GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
        GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
        GCNRPTracker::LiveRegSet *RegionLiveIns =
            &LiveIns[Stage->getRegionIdx()];

        reinterpret_cast<GCNRPTracker *>(DownwardTracker)
            ->reset(MRI, *RegionLiveIns);
        reinterpret_cast<GCNRPTracker *>(UpwardTracker)
            ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
                             Stage->getRegionIdx()));
      }

      ScheduleDAGMILive::schedule();
      Stage->finalizeGCNRegion();
    }

    Stage->finalizeGCNSchedStage();
  }
}

#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
  switch (StageID) {
  case GCNSchedStageID::OccInitialSchedule:
    OS << "Max Occupancy Initial Schedule";
    break;
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    OS << "Unclustered High Register Pressure Reschedule";
    break;
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    OS << "Clustered Low Occupancy Reschedule";
    break;
  case GCNSchedStageID::PreRARematerialize:
    OS << "Pre-RA Rematerialize";
    break;
  case GCNSchedStageID::ILPInitialSchedule:
    OS << "Max ILP Initial Schedule";
    break;
  case GCNSchedStageID::MemoryClauseInitialSchedule:
    OS << "Max memory clause Initial Schedule";
    break;
  }

  return OS;
}
#endif

GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}

bool GCNSchedStage::initGCNSchedStage() {
  if (!DAG.LIS)
    return false;

  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
  return true;
}

bool UnclusteredHighRPStage::initGCNSchedStage() {
  if (DisableUnclusterHighRP)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(
      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));

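  // Remember the occupancy this stage starts from; finalizeGCNSchedStage uses
  // it to report whether the stage managed to raise occupancy.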
  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase the occupancy target in the region.
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;
  if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
    MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");

  return true;
}

bool ClusteredLowOccStage::initGCNSchedStage() {
  if (DisableClusteredLowOccupancy)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  // Don't bother trying to improve ILP in lower RP regions if occupancy has
  // not been dropped. All regions will have already been scheduled with the
  // ideal occupancy targets.
  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
             << DAG.MinOccupancy << ".\n");
  return true;
}

bool PreRARematStage::initGCNSchedStage() {
  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
    return false;

  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  // Rematerialization will not help if occupancy is not limited by reg usage.
  if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy)
    return false;

  // FIXME: This pass will invalidate cached MBBLiveIns for regions in between
  // the defs and the region we sank the def to. Cached pressure for regions
  // where a def is sunk from will also be invalidated. This will need to be
  // fixed if there is another pass after this pass.
  assert(!S.hasNextStage());

  collectRematerializableInstructions();
  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with improved occupancy of "
             << DAG.MinOccupancy << " from rematerializing\n");
  return true;
}

void GCNSchedStage::finalizeGCNSchedStage() {
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}

void UnclusteredHighRPStage::finalizeGCNSchedStage() {
  SavedMutations.swap(DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
      DAG.RegionsWithMinOcc[IDX] =
          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;

    LLVM_DEBUG(dbgs() << StageID
                      << " stage successfully increased occupancy to "
                      << DAG.MinOccupancy << '\n');
  }

  GCNSchedStage::finalizeGCNSchedStage();
}

bool GCNSchedStage::initGCNRegion() {
  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

  // Skip empty scheduling regions (0 or 1 schedulable instructions).
  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n From: " << *DAG.begin() << " To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save original instruction order before scheduling for possible revert.
  Unsched.clear();
  Unsched.reserve(DAG.NumRegionInstrs);
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule) {
    for (auto &I : DAG) {
      Unsched.push_back(&I);
      if (isIGLPMutationOnly(I.getOpcode()))
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(&I);
  }

  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure: "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
    SavedMutations.clear();
    SavedMutations.swap(DAG.Mutations);
    bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                          StageID == GCNSchedStageID::ILPInitialSchedule;
    DAG.addMutation(createIGroupLPDAGMutation(
        IsInitialStage ? AMDGPU::SchedulingPhase::Initial
                       : AMDGPU::SchedulingPhase::PreRAReentry));
  }

  return true;
}

bool UnclusteredHighRPStage::initGCNRegion() {
  // Only reschedule regions with the minimum occupancy or regions that may
  // have spilling (excess register pressure).
  if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
       DAG.MinOccupancy <= InitialOccupancy) &&
      !DAG.RegionsWithExcessRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool ClusteredLowOccStage::initGCNRegion() {
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage, so the region
  // may be overly constrained even if it was already rescheduled.
  if (!DAG.RegionsWithHighRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool PreRARematStage::initGCNRegion() {
  if (!DAG.RescheduleRegions[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

void GCNSchedStage::setupNewBlock() {
  if (CurrentMBB)
    DAG.finishBlock();

  CurrentMBB = DAG.RegionBegin->getParent();
  DAG.startBlock(CurrentMBB);
  // Get real RP for the region if it hasn't been calculated before. After the
  // initial schedule stage, real RP will be collected after scheduling.
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule ||
      StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}

void GCNSchedStage::finalizeGCNRegion() {
  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
  DAG.RescheduleRegions[RegionIdx] = false;
  if (S.HasHighPressure)
    DAG.RegionsWithHighRP[RegionIdx] = true;

  // Revert scheduling if we have dropped occupancy or there is some other
  // reason that the original schedule is better.
  checkScheduling();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
    SavedMutations.swap(DAG.Mutations);

  DAG.exitRegion();
  RegionIdx++;
}

void GCNSchedStage::checkScheduling() {
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);

  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  unsigned TargetOccupancy = std::min(
      S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
  unsigned WavesAfter =
      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the
  // just scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits, which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(DAG.MinOccupancy);
    DAG.RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }
  // The maximum number of arch VGPRs on a non-unified register file, or the
  // maximum VGPR + AGPR in the unified register file case.
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  // The maximum number of arch VGPRs for both unified and non-unified register
  // files.
  unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);

  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
      PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RescheduleRegions[RegionIdx] = true;
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
  if (shouldRevertScheduling(WavesAfter)) {
    revertScheduling();
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
  }
}

unsigned
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
                                      DenseMap<unsigned, unsigned> &ReadyCycles,
                                      const TargetSchedModel &SM) {
  unsigned ReadyCycle = CurrCycle;
  for (auto &D : SU.Preds) {
    if (D.isAssignedRegDep()) {
      MachineInstr *DefMI = D.getSUnit()->getInstr();
      unsigned Latency = SM.computeInstrLatency(DefMI);
      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
    }
  }
  ReadyCycles[SU.NodeNum] = ReadyCycle;
  return ReadyCycle;
}

#ifndef NDEBUG
struct EarlierIssuingCycle {
  bool operator()(std::pair<MachineInstr *, unsigned> A,
                  std::pair<MachineInstr *, unsigned> B) const {
    return A.second < B.second;
  }
};

static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                                        EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction "
            " "
            " \n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
    IPrev = I.second;
  }
}
#endif

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &SU : InputSchedule) {
    unsigned ReadyCycle =
        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &MI : DAG) {
    SUnit *SU = DAG.getSUnit(&MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (WavesAfter < DAG.MinOccupancy)
    return true;

  return false;
}

bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
       mayCauseSpilling(WavesAfter)) ||
      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    return true;
  }

  // Do not attempt to relax the schedule even more if we are already spilling.
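  // Returning false here keeps the just-computed unclustered schedule for
  // this region.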
  if (isRegionWithExcessRP())
    return false;

  LLVM_DEBUG(
      dbgs()
      << "\n\t *** In shouldRevertScheduling ***\n"
      << " *********** BEFORE UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
  LLVM_DEBUG(
      dbgs()
      << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
  unsigned WavesBefore =
      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
       NewMetric) /
      ScheduleMetrics::ScaleFactor;
  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
                    << MAfter << "Profit: " << Profit << "\n");
  return Profit < ScheduleMetrics::ScaleFactor;
}

bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
    unsigned WavesAfter) {
  return mayCauseSpilling(WavesAfter);
}

bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
  if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
      !PressureAfter.less(MF, PressureBefore)) {
    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    return true;
  }

  return false;
}

void GCNSchedStage::revertScheduling() {
  DAG.RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  DAG.RescheduleRegions[RegionIdx] =
      S.hasNextStage() &&
      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
  DAG.RegionEnd = DAG.RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != DAG.RegionEnd) {
      DAG.BB->remove(MI);
      DAG.BB->insert(DAG.RegionEnd, MI);
      if (!MI->isDebugInstr())
        DAG.LIS->handleMove(*MI, true);
    }

    // Reset read-undef flags and update them later.
    for (auto &Op : MI->all_defs())
      Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (DAG.ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
      }
    }
    DAG.RegionEnd = MI->getIterator();
    ++DAG.RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting the schedule, debug instructions will now be at the end of
  // the block and RegionEnd will point to the first debug instruction.
  // Increment RegionEnd past the debug instructions to the actual end of the
  // scheduling region.
  while (SkippedDebugInstr-- > 0)
    ++DAG.RegionEnd;

  // If the Unsched.front() instruction is a debug instruction, this will
  // actually shrink the region since we moved all debug instructions to the
  // end of the block. Find the first instruction that is not a debug
  // instruction.
  DAG.RegionBegin = Unsched.front()->getIterator();
  if (DAG.RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      DAG.RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  DAG.placeDebugValues();

  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}

void PreRARematStage::collectRematerializableInstructions() {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!DAG.LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization.
    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
      continue;

    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
    MachineInstr *Def = Op->getParent();
    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
      continue;

    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    // We are only collecting defs that are defined in another block and are
    // live-through or used inside regions at MinOccupancy. This means that the
    // register must be in the live-in set for the region.
    bool AddedToRematList = false;
    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
      auto It = DAG.LiveIns[I].find(Reg);
      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
        if (DAG.RegionsWithMinOcc[I]) {
          RematerializableInsts[I][Def] = UseI;
          AddedToRematList = true;
        }

        // Collect regions with a rematerializable reg as a live-in to avoid
        // searching later when updating RP.
        RematDefToLiveInRegions[Def].push_back(I);
      }
    }
    if (!AddedToRematList)
      RematDefToLiveInRegions.erase(Def);
  }
}

bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                              const TargetInstrInfo *TII) {
  // Temporary copies of cached variables we will be modifying and replacing if
  // sinking succeeds.
  SmallVector<
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
      NewRegions;
  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
  DenseMap<unsigned, GCNRegPressure> NewPressure;
  BitVector NewRescheduleRegions;
  LiveIntervals *LIS = DAG.LIS;

  NewRegions.resize(DAG.Regions.size());
  NewRescheduleRegions.resize(DAG.Regions.size());

  // Collect only regions that have a rematerializable def as a live-in.
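  // (These impacted regions are the only ones whose cached pressure and
  // live-in sets need to be copied and updated while sinking.)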
  SmallSet<unsigned, 16> ImpactedRegions;
  for (const auto &It : RematDefToLiveInRegions)
    ImpactedRegions.insert(It.second.begin(), It.second.end());

  // Make copies of the register pressure and live-ins caches that will be
  // updated as we rematerialize.
  for (auto Idx : ImpactedRegions) {
    NewPressure[Idx] = DAG.Pressure[Idx];
    NewLiveIns[Idx] = DAG.LiveIns[Idx];
  }
  NewRegions = DAG.Regions;
  NewRescheduleRegions.reset();

  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
  bool Improved = false;
  for (auto I : ImpactedRegions) {
    if (!DAG.RegionsWithMinOcc[I])
      continue;

    Improved = false;
    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
    int SGPRUsage = NewPressure[I].getSGPRNum();

    // TODO: Handle occupancy drop due to AGPR and SGPR.
    // Check if the occupancy drop is caused by VGPR usage rather than by SGPR
    // usage.
    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
      break;

    // The occupancy of this region could have been improved by a previous
    // iteration's sinking of defs.
    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
      NewRescheduleRegions[I] = true;
      Improved = true;
      continue;
    }

    // First check if we have enough trivially rematerializable instructions to
    // improve occupancy. Optimistically assume all instructions we are able to
    // sink decrease RP.
    int TotalSinkableRegs = 0;
    for (const auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      Register DefReg = Def->getOperand(0).getReg();
      TotalSinkableRegs +=
          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
    }
    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
    // If, even in the most optimistic scenario, we cannot improve occupancy,
    // then do not attempt to sink any instructions.
    if (OptimisticOccupancy <= DAG.MinOccupancy)
      break;

    unsigned ImproveOccupancy = 0;
    SmallVector<MachineInstr *, 4> SinkedDefs;
    for (auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      MachineBasicBlock::iterator InsertPos =
          MachineBasicBlock::iterator(It.second);
      Register Reg = Def->getOperand(0).getReg();
      // Rematerialize MI to its use block. Since we are only rematerializing
      // instructions that do not have any virtual reg uses, we do not need to
      // call LiveRangeEdit::allUsesAvailableAt() and
      // LiveRangeEdit::canRematerializeAt().
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
      MachineInstr *NewMI = &*std::prev(InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
      InsertedMIToOldDef[NewMI] = Def;

      // Update region boundaries in the scheduling region we sank from, since
      // we may sink an instruction that was at the beginning or end of its
      // region.
      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                                 /*Removing =*/true);

      // Update region boundaries in the region we sank to.
      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);

      LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
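      // (Consequently, the cached pressure of the region the def was sunk
      // from may overestimate its actual RP.)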
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
      for (auto Idx : RematDefToLiveInRegions[Def]) {
        NewLiveIns[Idx].erase(Reg);
        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
          // Def is live-through and not used in this block.
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
        } else {
          // Def is used and rematerialized into this block.
          GCNDownwardRPTracker RPT(*LIS);
          auto *NonDbgMI = &*skipDebugInstructionsForward(
              NewRegions[Idx].first, NewRegions[Idx].second);
          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
          RPT.advance(NewRegions[Idx].second);
          NewPressure[Idx] = RPT.moveMaxPressure();
        }
      }

      SinkedDefs.push_back(Def);
      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
      if (ImproveOccupancy > DAG.MinOccupancy)
        break;
    }

    // Remove the defs we just sank from every region's list of sinkable defs.
    for (auto &Def : SinkedDefs)
      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
        RematerializableInsts[TrackedIdx].erase(Def);

    if (ImproveOccupancy <= DAG.MinOccupancy)
      break;

    NewRescheduleRegions[I] = true;
    Improved = true;
  }

  if (!Improved) {
    // Occupancy was not improved for all regions that were at MinOccupancy.
    // Undo sinking and remove the newly rematerialized instructions.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy was improved for all regions.
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    DAG.BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  // Update the live-ins, register pressure, and regions caches.
  for (auto Idx : ImpactedRegions) {
    DAG.LiveIns[Idx] = NewLiveIns[Idx];
    DAG.Pressure[Idx] = NewPressure[Idx];
    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
  }
  DAG.Regions = NewRegions;
  DAG.RescheduleRegions = NewRescheduleRegions;

  if (GCNTrackers)
    DAG.RegionLiveOuts.buildLiveRegMap();

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  return true;
}

bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
  if (!DAG.TII->isTriviallyReMaterializable(MI))
    return false;

  for (const MachineOperand &MO : MI.all_uses()) {
    if (MO.getReg().isVirtual())
      return false;

    // We cannot rematerialize physreg uses unless the register is a constant
    // or the use is ignorable (e.g. an
    // implicit exec use on a VALU instruction).
    if (MO.getReg().isPhysical()) {
      if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
        continue;
      return false;
    }
  }

  return true;
}

// When removing, we will have to check both the beginning and the end of the
// region. When inserting, we only have to check whether we are inserting NewMI
// in front of a scheduling region; there is no need to check the end, since we
// will only ever insert before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
                              MachineBasicBlock::iterator>> &RegionBoundaries,
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = RegionBoundaries.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != RegionBoundaries[I].first->getParent())
      return;

    if (Removing && MI == RegionBoundaries[I].first &&
        MI == RegionBoundaries[I].second) {
      // MI is in a region of size 1. After removing, the region will have
      // size 0; set RegionBegin and RegionEnd to the end-of-block iterator.
      RegionBoundaries[I] =
          std::pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == RegionBoundaries[I].first) {
      if (Removing)
        RegionBoundaries[I] =
            std::pair(std::next(MI), RegionBoundaries[I].second);
      else
        // Inserted NewMI in front of the region; set the new RegionBegin to
        // NewMI.
        RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
                                        RegionBoundaries[I].second);
      return;
    }
    if (Removing && MI == RegionBoundaries[I].second) {
      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
      return;
    }
  }
}

static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
    return isIGLPMutationOnly(MI->getOpcode());
  });
}

GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
    bool RemoveKillFlags)
    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}

void GCNPostScheduleDAGMILive::schedule() {
  HasIGLPInstrs = hasIGLPInstrs(this);
  if (HasIGLPInstrs) {
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
  }

  ScheduleDAGMI::schedule();
}

void GCNPostScheduleDAGMILive::finalizeSchedule() {
  if (HasIGLPInstrs)
    SavedMutations.swap(Mutations);

  ScheduleDAGMI::finalizeSchedule();
}
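
#if 0
// Illustrative sketch only (not compiled): a worked example of the
// occupancy/latency profit computed in
// UnclusteredHighRPStage::shouldRevertScheduling above. The numbers are
// hypothetical; ScheduleMetricBias defaults to 10 and
// ScheduleMetrics::ScaleFactor is 100, as defined earlier in this file.
static unsigned exampleUnclusteredProfit() {
  unsigned WavesBefore = 4, WavesAfter = 4;  // Occupancy unchanged.
  unsigned OldMetric = 120, NewMetric = 100; // Fewer stall cycles after.
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
       NewMetric) /
      ScheduleMetrics::ScaleFactor;
  // Profit = (100 * ((120 + 10) * 100) / 100) / 100 = 130. Since 130 >= 100
  // (the ScaleFactor break-even point), shouldRevertScheduling() returns
  // false and the unclustered schedule is kept.
  return Profit;
}
#endif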