xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (revision e77d428e46d94e1be6e5f38205b01d3f528d5e3f)
1 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This contains a MachineSchedStrategy implementation for maximizing wave
11 /// occupancy on GCN hardware.
12 ///
13 /// This pass will apply multiple scheduling stages to the same function.
14 /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15 /// entry point for the scheduling of those regions is
16 /// GCNScheduleDAGMILive::runSchedStages.
17 ///
18 /// Generally, the reason for having multiple scheduling stages is to account
19 /// for the kernel-wide effect of register usage on occupancy.  Usually, only a
20 /// few scheduling regions will have register pressure high enough to limit
21 /// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22 /// other regions.
23 ///
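/// For reference, GCNMaxOccupancySchedStrategy (constructed below) runs the
/// stages in this order: OccInitialSchedule, UnclusteredHighRPReschedule,
/// ClusteredLowOccupancyReschedule, and PreRARematerialize, while
/// GCNMaxILPSchedStrategy and GCNMaxMemoryClauseSchedStrategy each run a
/// single initial-schedule stage.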
24 //===----------------------------------------------------------------------===//
25 
26 #include "GCNSchedStrategy.h"
27 #include "AMDGPUIGroupLP.h"
28 #include "SIMachineFunctionInfo.h"
29 #include "llvm/CodeGen/RegisterClassInfo.h"
30 
31 #define DEBUG_TYPE "machine-scheduler"
32 
33 using namespace llvm;
34 
35 static cl::opt<bool> DisableUnclusterHighRP(
36     "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
37     cl::desc("Disable unclustered high register pressure "
38              "reduction scheduling stage."),
39     cl::init(false));
40 
41 static cl::opt<bool> DisableClusteredLowOccupancy(
42     "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
43     cl::desc("Disable clustered low occupancy "
44              "rescheduling for ILP scheduling stage."),
45     cl::init(false));
46 
47 static cl::opt<unsigned> ScheduleMetricBias(
48     "amdgpu-schedule-metric-bias", cl::Hidden,
49     cl::desc(
50         "Sets the bias which adds weight to occupancy vs latency. Set it to "
51         "100 to chase the occupancy only."),
52     cl::init(10));
53 
54 static cl::opt<bool>
55     RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
56                cl::desc("Relax occupancy targets for kernels which are memory "
57                         "bound (amdgpu-membound-threshold), or "
58                         "Wave Limited (amdgpu-limit-wave-threshold)."),
59                cl::init(false));
60 
61 static cl::opt<bool> GCNTrackers(
62     "amdgpu-use-amdgpu-trackers", cl::Hidden,
63     cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
64     cl::init(false));
65 
66 const unsigned ScheduleMetrics::ScaleFactor = 100;
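// A note on how this scale factor is used in this file: the debug output in
// GCNSchedStage::getScheduleMetrics below reports
// (SumBubbles * ScaleFactor) / CurrCycle, i.e. roughly the percentage of the
// modeled schedule length spent in latency bubbles.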
67 
68 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
69     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
70       DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
71 }
72 
73 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
74   GenericScheduler::initialize(DAG);
75 
76   MF = &DAG->MF;
77 
78   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
79 
80   SGPRExcessLimit =
81       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
82   VGPRExcessLimit =
83       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
84 
85   SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
86   // Set the initial TargetOccupancy to the maximum occupancy that we can
87   // achieve for this function. This effectively sets a lower bound on the
88   // 'Critical' register limits in the scheduler.
89   // Allow for lower occupancy targets if the kernel is wave limited or memory
90   // bound and the relaxed occupancy feature is enabled.
91   TargetOccupancy =
92       RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
93   SGPRCriticalLimit =
94       std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
95 
96   if (!KnownExcessRP) {
97     VGPRCriticalLimit =
98         std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
99   } else {
100     // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
101     // it returns a reasonably small number for targets with lots of VGPRs, such
102     // as GFX10 and GFX11.
103     LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
104                          "VGPRCriticalLimit calculation method.\n");
105 
106     unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
107     unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
108     unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
109     VGPRBudget = std::max(VGPRBudget, Granule);
110     VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
111   }
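  // Illustrative arithmetic with assumed (not target-accurate) numbers: for
  // Addressable = 256, TargetOccupancy = 7 and Granule = 8, the budget is
  // alignDown(256 / 7, 8) = alignDown(36, 8) = 32, which is then clamped below
  // by the granule and above by VGPRExcessLimit.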
112 
113   // Subtract error margin and bias from register limits and avoid overflow.
114   SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
115   VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
116   SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
117   VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
118 
119   LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
120                     << ", VGPRExcessLimit = " << VGPRExcessLimit
121                     << ", SGPRCriticalLimit = " << SGPRCriticalLimit
122                     << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
123 }
124 
125 /// Checks whether \p SU can use the cached DAG pressure diffs to compute the
126 /// current register pressure.
127 ///
128 /// This works for the common case, but it has a few exceptions that have been
129 /// observed through trial and error:
130 ///   - Explicit physical register operands
131 ///   - Subregister definitions
132 ///
133 /// In both of those cases, PressureDiff doesn't represent the actual pressure,
134 /// and querying LiveIntervals through the RegPressureTracker is needed to get
135 /// an accurate value.
136 ///
137 /// We should eventually only use PressureDiff for maximum performance, but this
138 /// already allows 80% of SUs to take the fast path without changing scheduling
139 /// at all. Further changes would either change scheduling, or require a lot
140 /// more logic to recover an accurate pressure estimate from the PressureDiffs.
141 static bool canUsePressureDiffs(const SUnit &SU) {
142   if (!SU.isInstr())
143     return false;
144 
145   // Cannot use pressure diffs for subregister defs or with physregs; they are
146   // imprecise in both cases.
147   for (const auto &Op : SU.getInstr()->operands()) {
148     if (!Op.isReg() || Op.isImplicit())
149       continue;
150     if (Op.getReg().isPhysical() ||
151         (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
152       return false;
153   }
154   return true;
155 }
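// As an illustration (hypothetical MIR, not taken from a real test case): an
// instruction with an explicit physical register operand such as
//   $m0 = S_MOV_B32 ...
// or a subregister definition such as
//   %10.sub0:vreg_64 = V_MOV_B32_e32 ...
// is sent down the slower RegPressureTracker path by the check above.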
156 
157 static void getRegisterPressures(
158     bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
159     std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
160     GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
161     ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
162   // getDownwardPressure() and getUpwardPressure() make temporary changes to
163   // the tracker, so we need to pass them a non-const reference.
164   RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
165   if (!GCNTrackers) {
166     AtTop
167         ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
168         : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
169 
170     return;
171   }
172 
173   // GCNTrackers
174   Pressure.resize(4, 0);
175   MachineInstr *MI = SU->getInstr();
176   GCNRegPressure NewPressure;
177   if (AtTop) {
178     GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
179     NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
180   } else {
181     GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
182     TempUpwardTracker.recede(*MI);
183     NewPressure = TempUpwardTracker.getPressure();
184   }
185   Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
186   Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
187       NewPressure.getArchVGPRNum();
188   Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
189 }
190 
191 // Return true if the instruction is mutually exclusive with all non-IGLP DAG
192 // mutations, requiring all other mutations to be disabled.
193 static bool isIGLPMutationOnly(unsigned Opcode) {
194   return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT;
195 }
196 
197 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
198                                      bool AtTop,
199                                      const RegPressureTracker &RPTracker,
200                                      const SIRegisterInfo *SRI,
201                                      unsigned SGPRPressure,
202                                      unsigned VGPRPressure, bool IsBottomUp) {
203   Cand.SU = SU;
204   Cand.AtTop = AtTop;
205 
206   if (!DAG->isTrackingPressure())
207     return;
208 
209   Pressure.clear();
210   MaxPressure.clear();
211 
212   // We try to use the cached PressureDiffs in the ScheduleDAG whenever
213   // possible over querying the RegPressureTracker.
214   //
215   // The RegPressureTracker makes a lot of LIS queries, which are very
216   // expensive; it is considered slow in this context.
217   //
218   // PressureDiffs are precomputed and cached, and getPressureDiff is just a
219   // trivial lookup into an array. It is pretty much free.
220   //
221   // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
222   // PressureDiffs.
223   if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
224     getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
225                          DownwardTracker, UpwardTracker, DAG, SRI);
226   } else {
227     // Initialize the four pressure-set slots to zero.
228     Pressure.resize(4, 0);
229     Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
230     Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
231 
232     for (const auto &Diff : DAG->getPressureDiff(SU)) {
233       if (!Diff.isValid())
234         continue;
235       // PressureDiffs is always bottom-up so if we're working top-down we need
236       // to invert its sign.
237       Pressure[Diff.getPSet()] +=
238           (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
239     }
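    // Mechanical illustration with an assumed diff: a cached +2 change to the
    // VGPR_32 set is added as +2 for a bottom-up candidate and as -2 for a
    // top-down candidate, per the sign inversion above.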
240 
241 #ifdef EXPENSIVE_CHECKS
242     std::vector<unsigned> CheckPressure, CheckMaxPressure;
243     getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
244                          DownwardTracker, UpwardTracker, DAG, SRI);
245     if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
246             CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
247         Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
248             CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
249       errs() << "Register Pressure is inaccurate when calculated through "
250                 "PressureDiff\n"
251              << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
252              << ", expected "
253              << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
254              << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
255              << ", expected "
256              << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
257       report_fatal_error("inaccurate register pressure calculation");
258     }
259 #endif
260   }
261 
262   unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
263   unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
264 
265   // If two instructions increase the pressure of different register sets
266   // by the same amount, the generic scheduler will prefer to schedule the
267   // instruction that increases the set with the fewest registers,
268   // which in our case would be SGPRs.  This is rarely what we want, so
269   // when we report excess/critical register pressure, we do it either
270   // only for VGPRs or only for SGPRs.
271 
272   // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
273   const unsigned MaxVGPRPressureInc = 16;
274   bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
275   bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
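  // Example with assumed values: if VGPRExcessLimit is 256 and the current
  // VGPR pressure is 245, VGPR tracking is enabled (245 + 16 >= 256) and SGPR
  // excess is ignored for this candidate.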
276 
277   // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
278   // to increase the likelihood we don't go over the limits.  We should improve
279   // the analysis to look through dependencies to find the path with the least
280   // register pressure.
281 
282   // We only need to update the RPDelta for instructions that increase register
283   // pressure. Instructions that decrease or keep reg pressure the same will be
284   // marked as RegExcess in tryCandidate() when they are compared with
285   // instructions that increase the register pressure.
286   if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
287     HasHighPressure = true;
288     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
289     Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
290   }
291 
292   if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
293     HasHighPressure = true;
294     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
295     Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
296   }
297 
298   // Register pressure is considered 'CRITICAL' if it is approaching a value
299   // that would reduce the wave occupancy for the execution unit.  When
300   // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
301   // have the same cost, so we don't need to prefer one over the other.
302 
303   int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
304   int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
305 
306   if (SGPRDelta >= 0 || VGPRDelta >= 0) {
307     HasHighPressure = true;
308     if (SGPRDelta > VGPRDelta) {
309       Cand.RPDelta.CriticalMax =
310         PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
311       Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
312     } else {
313       Cand.RPDelta.CriticalMax =
314         PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
315       Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
316     }
317   }
318 }
319 
320 // This function is mostly cut and pasted from
321 // GenericScheduler::pickNodeFromQueue()
322 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
323                                          const CandPolicy &ZonePolicy,
324                                          const RegPressureTracker &RPTracker,
325                                          SchedCandidate &Cand,
326                                          bool IsBottomUp) {
327   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
328   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
329   unsigned SGPRPressure = 0;
330   unsigned VGPRPressure = 0;
331   if (DAG->isTrackingPressure()) {
332     if (!GCNTrackers) {
333       SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
334       VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
335     } else {
336       GCNRPTracker *T = IsBottomUp
337                             ? static_cast<GCNRPTracker *>(&UpwardTracker)
338                             : static_cast<GCNRPTracker *>(&DownwardTracker);
339       SGPRPressure = T->getPressure().getSGPRNum();
340       VGPRPressure = T->getPressure().getArchVGPRNum();
341     }
342   }
343   ReadyQueue &Q = Zone.Available;
344   for (SUnit *SU : Q) {
345 
346     SchedCandidate TryCand(ZonePolicy);
347     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
348                   VGPRPressure, IsBottomUp);
349     // Pass SchedBoundary only when comparing nodes from the same boundary.
350     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
351     tryCandidate(Cand, TryCand, ZoneArg);
352     if (TryCand.Reason != NoCand) {
353       // Initialize resource delta if needed in case future heuristics query it.
354       if (TryCand.ResDelta == SchedResourceDelta())
355         TryCand.initResourceDelta(Zone.DAG, SchedModel);
356       Cand.setBest(TryCand);
357       LLVM_DEBUG(traceCandidate(Cand));
358     }
359   }
360 }
361 
362 // This function is mostly cut and pasted from
363 // GenericScheduler::pickNodeBidirectional()
364 SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
365   // Schedule as far as possible in the direction of no choice. This is most
366   // efficient, but also provides the best heuristics for CriticalPSets.
367   if (SUnit *SU = Bot.pickOnlyChoice()) {
368     IsTopNode = false;
369     return SU;
370   }
371   if (SUnit *SU = Top.pickOnlyChoice()) {
372     IsTopNode = true;
373     return SU;
374   }
375   // Set the bottom-up policy based on the state of the current bottom zone and
376   // the instructions outside the zone, including the top zone.
377   CandPolicy BotPolicy;
378   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
379   // Set the top-down policy based on the state of the current top zone and
380   // the instructions outside the zone, including the bottom zone.
381   CandPolicy TopPolicy;
382   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
383 
384   // See if BotCand is still valid (because we previously scheduled from Top).
385   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
386   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
387       BotCand.Policy != BotPolicy) {
388     BotCand.reset(CandPolicy());
389     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
390                       /*IsBottomUp=*/true);
391     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
392   } else {
393     LLVM_DEBUG(traceCandidate(BotCand));
394 #ifndef NDEBUG
395     if (VerifyScheduling) {
396       SchedCandidate TCand;
397       TCand.reset(CandPolicy());
398       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
399                         /*IsBottomUp=*/true);
400       assert(TCand.SU == BotCand.SU &&
401              "Last pick result should correspond to re-picking right now");
402     }
403 #endif
404   }
405 
406   // Check if the top Q has a better candidate.
407   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
408   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
409       TopCand.Policy != TopPolicy) {
410     TopCand.reset(CandPolicy());
411     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
412                       /*IsBottomUp=*/false);
413     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
414   } else {
415     LLVM_DEBUG(traceCandidate(TopCand));
416 #ifndef NDEBUG
417     if (VerifyScheduling) {
418       SchedCandidate TCand;
419       TCand.reset(CandPolicy());
420       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
421                         /*IsBottomUp=*/false);
422       assert(TCand.SU == TopCand.SU &&
423            "Last pick result should correspond to re-picking right now");
424     }
425 #endif
426   }
427 
428   // Pick best from BotCand and TopCand.
429   LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
430              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
431   SchedCandidate Cand = BotCand;
432   TopCand.Reason = NoCand;
433   tryCandidate(Cand, TopCand, nullptr);
434   if (TopCand.Reason != NoCand) {
435     Cand.setBest(TopCand);
436   }
437   LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
438 
439   IsTopNode = Cand.AtTop;
440   return Cand.SU;
441 }
442 
443 // This function is mostly cut and pasted from
444 // GenericScheduler::pickNode()
445 SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
446   if (DAG->top() == DAG->bottom()) {
447     assert(Top.Available.empty() && Top.Pending.empty() &&
448            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
449     return nullptr;
450   }
451   SUnit *SU;
452   do {
453     if (RegionPolicy.OnlyTopDown) {
454       SU = Top.pickOnlyChoice();
455       if (!SU) {
456         CandPolicy NoPolicy;
457         TopCand.reset(NoPolicy);
458         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
459                           /*IsBottomUp=*/false);
460         assert(TopCand.Reason != NoCand && "failed to find a candidate");
461         SU = TopCand.SU;
462       }
463       IsTopNode = true;
464     } else if (RegionPolicy.OnlyBottomUp) {
465       SU = Bot.pickOnlyChoice();
466       if (!SU) {
467         CandPolicy NoPolicy;
468         BotCand.reset(NoPolicy);
469         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
470                           /*IsBottomUp=*/true);
471         assert(BotCand.Reason != NoCand && "failed to find a candidate");
472         SU = BotCand.SU;
473       }
474       IsTopNode = false;
475     } else {
476       SU = pickNodeBidirectional(IsTopNode);
477     }
478   } while (SU->isScheduled);
479 
480   if (SU->isTopReady())
481     Top.removeReady(SU);
482   if (SU->isBottomReady())
483     Bot.removeReady(SU);
484 
485   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
486                     << *SU->getInstr());
487   return SU;
488 }
489 
490 void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
491   if (GCNTrackers) {
492     MachineInstr *MI = SU->getInstr();
493     IsTopNode ? (void)DownwardTracker.advance(MI, false)
494               : UpwardTracker.recede(*MI);
495   }
496 
497   return GenericScheduler::schedNode(SU, IsTopNode);
498 }
499 
500 GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
501   assert(CurrentStage && CurrentStage != SchedStages.end());
502   return *CurrentStage;
503 }
504 
505 bool GCNSchedStrategy::advanceStage() {
506   assert(CurrentStage != SchedStages.end());
507   if (!CurrentStage)
508     CurrentStage = SchedStages.begin();
509   else
510     CurrentStage++;
511 
512   return CurrentStage != SchedStages.end();
513 }
514 
515 bool GCNSchedStrategy::hasNextStage() const {
516   assert(CurrentStage);
517   return std::next(CurrentStage) != SchedStages.end();
518 }
519 
520 GCNSchedStageID GCNSchedStrategy::getNextStage() const {
521   assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
522   return *std::next(CurrentStage);
523 }
524 
525 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
526     const MachineSchedContext *C, bool IsLegacyScheduler)
527     : GCNSchedStrategy(C) {
528   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
529   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
530   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
531   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
532   GCNTrackers = GCNTrackers & !IsLegacyScheduler;
533 }
534 
535 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
536     : GCNSchedStrategy(C) {
537   SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
538 }
539 
540 bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
541                                           SchedCandidate &TryCand,
542                                           SchedBoundary *Zone) const {
543   // Initialize the candidate if needed.
544   if (!Cand.isValid()) {
545     TryCand.Reason = NodeOrder;
546     return true;
547   }
548 
549   // Avoid spilling by exceeding the register limit.
550   if (DAG->isTrackingPressure() &&
551       tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
552                   RegExcess, TRI, DAG->MF))
553     return TryCand.Reason != NoCand;
554 
555   // Bias PhysReg defs and copies towards their uses and definitions, respectively.
556   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
557                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
558     return TryCand.Reason != NoCand;
559 
560   bool SameBoundary = Zone != nullptr;
561   if (SameBoundary) {
562     // Prioritize instructions that read unbuffered resources by stall cycles.
563     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
564                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
565       return TryCand.Reason != NoCand;
566 
567     // Avoid critical resource consumption and balance the schedule.
568     TryCand.initResourceDelta(DAG, SchedModel);
569     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
570                 TryCand, Cand, ResourceReduce))
571       return TryCand.Reason != NoCand;
572     if (tryGreater(TryCand.ResDelta.DemandedResources,
573                    Cand.ResDelta.DemandedResources, TryCand, Cand,
574                    ResourceDemand))
575       return TryCand.Reason != NoCand;
576 
577     // Unconditionally try to reduce latency.
578     if (tryLatency(TryCand, Cand, *Zone))
579       return TryCand.Reason != NoCand;
580 
581     // Weak edges are for clustering and other constraints.
582     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
583                 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
584       return TryCand.Reason != NoCand;
585   }
586 
587   // Keep clustered nodes together to encourage downstream peephole
588   // optimizations which may reduce resource requirements.
589   //
590   // This is a best effort to set things up for a post-RA pass. Optimizations
591   // like generating loads of multiple registers should ideally be done within
592   // the scheduler pass by combining the loads during DAG postprocessing.
593   const SUnit *CandNextClusterSU =
594       Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
595   const SUnit *TryCandNextClusterSU =
596       TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
597   if (tryGreater(TryCand.SU == TryCandNextClusterSU,
598                  Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
599     return TryCand.Reason != NoCand;
600 
601   // Avoid increasing the max critical pressure in the scheduled region.
602   if (DAG->isTrackingPressure() &&
603       tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
604                   TryCand, Cand, RegCritical, TRI, DAG->MF))
605     return TryCand.Reason != NoCand;
606 
607   // Avoid increasing the max pressure of the entire region.
608   if (DAG->isTrackingPressure() &&
609       tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
610                   Cand, RegMax, TRI, DAG->MF))
611     return TryCand.Reason != NoCand;
612 
613   if (SameBoundary) {
614     // Fall through to original instruction order.
615     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
616         (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
617       TryCand.Reason = NodeOrder;
618       return true;
619     }
620   }
621   return false;
622 }
623 
624 GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
625     const MachineSchedContext *C)
626     : GCNSchedStrategy(C) {
627   SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule);
628 }
629 
630 /// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory instructions
631 /// as much as possible. This is achieved by:
632 ///  1. Prioritizing clustered operations before the stall latency heuristic.
633 ///  2. Prioritizing long-latency loads before the stall latency heuristic.
634 ///
635 /// \param Cand provides the policy and current best candidate.
636 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
637 /// \param Zone describes the scheduled zone that we are extending, or nullptr
638 ///             if Cand is from a different zone than TryCand.
639 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
640 bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
641                                                    SchedCandidate &TryCand,
642                                                    SchedBoundary *Zone) const {
643   // Initialize the candidate if needed.
644   if (!Cand.isValid()) {
645     TryCand.Reason = NodeOrder;
646     return true;
647   }
648 
649   // Bias PhysReg defs and copies towards their uses and definitions, respectively.
650   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
651                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
652     return TryCand.Reason != NoCand;
653 
654   if (DAG->isTrackingPressure()) {
655     // Avoid exceeding the target's limit.
656     if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
657                     RegExcess, TRI, DAG->MF))
658       return TryCand.Reason != NoCand;
659 
660     // Avoid increasing the max critical pressure in the scheduled region.
661     if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
662                     TryCand, Cand, RegCritical, TRI, DAG->MF))
663       return TryCand.Reason != NoCand;
664   }
665 
666   // MaxMemoryClause-specific: We prioritize clustered instructions as we would
667   // get more benefit from clausing these memory instructions.
668   const SUnit *CandNextClusterSU =
669       Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
670   const SUnit *TryCandNextClusterSU =
671       TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
672   if (tryGreater(TryCand.SU == TryCandNextClusterSU,
673                  Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
674     return TryCand.Reason != NoCand;
675 
676   // We only compare a subset of features when comparing nodes between
677   // Top and Bottom boundary. Some properties are simply incomparable, in many
678   // other instances we should only override the other boundary if something
679   // is a clear good pick on one boundary. Skip heuristics that are more
680   // "tie-breaking" in nature.
681   bool SameBoundary = Zone != nullptr;
682   if (SameBoundary) {
683     // For loops that are acyclic path limited, aggressively schedule for
684     // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
685     // heuristics to take precedence.
686     if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
687         tryLatency(TryCand, Cand, *Zone))
688       return TryCand.Reason != NoCand;
689 
690     // MaxMemoryClause-specific: Prioritize long latency memory load
691     // instructions in top-bottom order to hide more latency. The mayLoad check
692     // is used to exclude store-like instructions, which we do not want to
693     // schedule too early.
694     bool TryMayLoad =
695         TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
696     bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
697 
698     if (TryMayLoad || CandMayLoad) {
699       bool TryLongLatency =
700           TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
701       bool CandLongLatency =
702           10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
703 
704       if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
705                      Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
706                      Cand, Stall))
707         return TryCand.Reason != NoCand;
708     }
709     // Prioritize instructions that read unbuffered resources by stall cycles.
710     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
711                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
712       return TryCand.Reason != NoCand;
713   }
714 
715   if (SameBoundary) {
716     // Weak edges are for clustering and other constraints.
717     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
718                 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
719       return TryCand.Reason != NoCand;
720   }
721 
722   // Avoid increasing the max pressure of the entire region.
723   if (DAG->isTrackingPressure() &&
724       tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
725                   Cand, RegMax, TRI, DAG->MF))
726     return TryCand.Reason != NoCand;
727 
728   if (SameBoundary) {
729     // Avoid critical resource consumption and balance the schedule.
730     TryCand.initResourceDelta(DAG, SchedModel);
731     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
732                 TryCand, Cand, ResourceReduce))
733       return TryCand.Reason != NoCand;
734     if (tryGreater(TryCand.ResDelta.DemandedResources,
735                    Cand.ResDelta.DemandedResources, TryCand, Cand,
736                    ResourceDemand))
737       return TryCand.Reason != NoCand;
738 
739     // Avoid serializing long latency dependence chains.
740     // For acyclic path limited loops, latency was already checked above.
741     if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
742         !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
743       return TryCand.Reason != NoCand;
744 
745     // Fall through to original instruction order.
746     if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
747       assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
748       TryCand.Reason = NodeOrder;
749       return true;
750     }
751   }
752 
753   return false;
754 }
755 
756 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
757     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
758     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
759       MFI(*MF.getInfo<SIMachineFunctionInfo>()),
760       StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
761       RegionLiveOuts(this, /*IsLiveOut=*/true) {
762 
763   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
764   if (RelaxedOcc) {
765     MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
766     if (MinOccupancy != StartingOccupancy)
767       LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
768                         << ".\n");
769   }
770 }
771 
772 std::unique_ptr<GCNSchedStage>
773 GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
774   switch (SchedStageID) {
775   case GCNSchedStageID::OccInitialSchedule:
776     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
777   case GCNSchedStageID::UnclusteredHighRPReschedule:
778     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
779   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
780     return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
781   case GCNSchedStageID::PreRARematerialize:
782     return std::make_unique<PreRARematStage>(SchedStageID, *this);
783   case GCNSchedStageID::ILPInitialSchedule:
784     return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
785   case GCNSchedStageID::MemoryClauseInitialSchedule:
786     return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
787                                                               *this);
788   }
789 
790   llvm_unreachable("Unknown SchedStageID.");
791 }
792 
793 void GCNScheduleDAGMILive::schedule() {
794   // Collect all scheduling regions. The actual scheduling is performed in
795   // GCNScheduleDAGMILive::finalizeSchedule.
796   Regions.push_back(std::pair(RegionBegin, RegionEnd));
797 }
798 
799 GCNRegPressure
800 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
801   GCNDownwardRPTracker RPTracker(*LIS);
802   RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
803   return RPTracker.moveMaxPressure();
804 }
805 
806 static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
807                                         MachineBasicBlock::iterator RegionEnd) {
808   auto REnd = RegionEnd == RegionBegin->getParent()->end()
809                   ? std::prev(RegionEnd)
810                   : RegionEnd;
811   return &*skipDebugInstructionsBackward(REnd, RegionBegin);
812 }
813 
814 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
815                                                 const MachineBasicBlock *MBB) {
816   GCNDownwardRPTracker RPTracker(*LIS);
817 
818   // If the block has only one successor then the live-ins of that successor
819   // are live-outs of the current block. We can reuse the calculated live set
820   // if the successor will be sent to scheduling past the current block.
821 
822   // However, due to a bug in LiveInterval analysis it may happen that two
823   // predecessors of the same successor block have different lane bitmasks for
824   // a live-out register. Work around that by sticking to a one-to-one
825   // relationship, i.e. one predecessor with one successor block.
826   const MachineBasicBlock *OnlySucc = nullptr;
827   if (MBB->succ_size() == 1) {
828     auto *Candidate = *MBB->succ_begin();
829     if (!Candidate->empty() && Candidate->pred_size() == 1) {
830       SlotIndexes *Ind = LIS->getSlotIndexes();
831       if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
832         OnlySucc = Candidate;
833     }
834   }
835 
836   // The scheduler sends regions from the end of the block upwards.
837   size_t CurRegion = RegionIdx;
838   for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
839     if (Regions[CurRegion].first->getParent() != MBB)
840       break;
841   --CurRegion;
842 
843   auto I = MBB->begin();
844   auto LiveInIt = MBBLiveIns.find(MBB);
845   auto &Rgn = Regions[CurRegion];
846   auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
847   if (LiveInIt != MBBLiveIns.end()) {
848     auto LiveIn = std::move(LiveInIt->second);
849     RPTracker.reset(*MBB->begin(), &LiveIn);
850     MBBLiveIns.erase(LiveInIt);
851   } else {
852     I = Rgn.first;
853     auto LRS = BBLiveInMap.lookup(NonDbgMI);
854 #ifdef EXPENSIVE_CHECKS
855     assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
856 #endif
857     RPTracker.reset(*I, &LRS);
858   }
859 
860   for (;;) {
861     I = RPTracker.getNext();
862 
863     if (Regions[CurRegion].first == I || NonDbgMI == I) {
864       LiveIns[CurRegion] = RPTracker.getLiveRegs();
865       RPTracker.clearMaxPressure();
866     }
867 
868     if (Regions[CurRegion].second == I) {
869       Pressure[CurRegion] = RPTracker.moveMaxPressure();
870       if (CurRegion-- == RegionIdx)
871         break;
872     }
873     RPTracker.advanceToNext();
874     RPTracker.advanceBeforeNext();
875   }
876 
877   if (OnlySucc) {
878     if (I != MBB->end()) {
879       RPTracker.advanceToNext();
880       RPTracker.advance(MBB->end());
881     }
882     RPTracker.advanceBeforeNext();
883     MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
884   }
885 }
886 
887 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
888 GCNScheduleDAGMILive::getRegionLiveInMap() const {
889   assert(!Regions.empty());
890   std::vector<MachineInstr *> RegionFirstMIs;
891   RegionFirstMIs.reserve(Regions.size());
892   auto I = Regions.rbegin(), E = Regions.rend();
893   auto *BB = I->first->getParent();
894   do {
895     auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
896     RegionFirstMIs.push_back(MI);
897     do {
898       ++I;
899     } while (I != E && I->first->getParent() == BB);
900   } while (I != E);
901   return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
902 }
903 
904 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
905 GCNScheduleDAGMILive::getRegionLiveOutMap() const {
906   assert(!Regions.empty());
907   std::vector<MachineInstr *> RegionLastMIs;
908   RegionLastMIs.reserve(Regions.size());
909   for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
910     RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
911 
912   return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
913 }
914 
915 void RegionPressureMap::buildLiveRegMap() {
916   IdxToInstruction.clear();
917 
918   RegionLiveRegMap =
919       IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
920   for (unsigned I = 0; I < DAG->Regions.size(); I++) {
921     MachineInstr *RegionKey =
922         IsLiveOut
923             ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
924             : &*DAG->Regions[I].first;
925     IdxToInstruction[I] = RegionKey;
926   }
927 }
928 
929 void GCNScheduleDAGMILive::finalizeSchedule() {
930   // Start actual scheduling here. This function is called by the base
931   // MachineScheduler after all regions have been recorded by
932   // GCNScheduleDAGMILive::schedule().
933   LiveIns.resize(Regions.size());
934   Pressure.resize(Regions.size());
935   RescheduleRegions.resize(Regions.size());
936   RegionsWithHighRP.resize(Regions.size());
937   RegionsWithExcessRP.resize(Regions.size());
938   RegionsWithMinOcc.resize(Regions.size());
939   RegionsWithIGLPInstrs.resize(Regions.size());
940   RescheduleRegions.set();
941   RegionsWithHighRP.reset();
942   RegionsWithExcessRP.reset();
943   RegionsWithMinOcc.reset();
944   RegionsWithIGLPInstrs.reset();
945 
946   runSchedStages();
947 }
948 
949 void GCNScheduleDAGMILive::runSchedStages() {
950   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
951 
952   if (!Regions.empty()) {
953     BBLiveInMap = getRegionLiveInMap();
954     if (GCNTrackers)
955       RegionLiveOuts.buildLiveRegMap();
956   }
957 
958   GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
959   while (S.advanceStage()) {
960     auto Stage = createSchedStage(S.getCurrentStage());
961     if (!Stage->initGCNSchedStage())
962       continue;
963 
964     for (auto Region : Regions) {
965       RegionBegin = Region.first;
966       RegionEnd = Region.second;
967       // Setup for scheduling the region and check whether it should be skipped.
968       if (!Stage->initGCNRegion()) {
969         Stage->advanceRegion();
970         exitRegion();
971         continue;
972       }
973 
974       if (GCNTrackers) {
975         GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
976         GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
977         GCNRPTracker::LiveRegSet *RegionLiveIns =
978             &LiveIns[Stage->getRegionIdx()];
979 
980         reinterpret_cast<GCNRPTracker *>(DownwardTracker)
981             ->reset(MRI, *RegionLiveIns);
982         reinterpret_cast<GCNRPTracker *>(UpwardTracker)
983             ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
984                              Stage->getRegionIdx()));
985       }
986 
987       ScheduleDAGMILive::schedule();
988       Stage->finalizeGCNRegion();
989     }
990 
991     Stage->finalizeGCNSchedStage();
992   }
993 }
994 
995 #ifndef NDEBUG
996 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
997   switch (StageID) {
998   case GCNSchedStageID::OccInitialSchedule:
999     OS << "Max Occupancy Initial Schedule";
1000     break;
1001   case GCNSchedStageID::UnclusteredHighRPReschedule:
1002     OS << "Unclustered High Register Pressure Reschedule";
1003     break;
1004   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
1005     OS << "Clustered Low Occupancy Reschedule";
1006     break;
1007   case GCNSchedStageID::PreRARematerialize:
1008     OS << "Pre-RA Rematerialize";
1009     break;
1010   case GCNSchedStageID::ILPInitialSchedule:
1011     OS << "Max ILP Initial Schedule";
1012     break;
1013   case GCNSchedStageID::MemoryClauseInitialSchedule:
1014     OS << "Max memory clause Initial Schedule";
1015     break;
1016   }
1017 
1018   return OS;
1019 }
1020 #endif
1021 
1022 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
1023     : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
1024       MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
1025 
1026 bool GCNSchedStage::initGCNSchedStage() {
1027   if (!DAG.LIS)
1028     return false;
1029 
1030   LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1031   return true;
1032 }
1033 
1034 bool UnclusteredHighRPStage::initGCNSchedStage() {
1035   if (DisableUnclusterHighRP)
1036     return false;
1037 
1038   if (!GCNSchedStage::initGCNSchedStage())
1039     return false;
1040 
1041   if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1042     return false;
1043 
1044   SavedMutations.swap(DAG.Mutations);
1045   DAG.addMutation(
1046       createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
1047 
1048   InitialOccupancy = DAG.MinOccupancy;
1049   // Aggressively try to reduce register pressure in the unclustered high RP
1050   // stage. Temporarily increase the occupancy target in the region.
1051   S.SGPRLimitBias = S.HighRPSGPRBias;
1052   S.VGPRLimitBias = S.HighRPVGPRBias;
1053   if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1054     MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1055 
1056   LLVM_DEBUG(
1057       dbgs()
1058       << "Retrying function scheduling without clustering. "
1059          "Aggressively try to reduce register pressure to achieve occupancy "
1060       << DAG.MinOccupancy << ".\n");
1061 
1062   return true;
1063 }
1064 
1065 bool ClusteredLowOccStage::initGCNSchedStage() {
1066   if (DisableClusteredLowOccupancy)
1067     return false;
1068 
1069   if (!GCNSchedStage::initGCNSchedStage())
1070     return false;
1071 
1072   // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1073   // been dropped. All regions will have already been scheduled with the ideal
1074   // occupancy targets.
1075   if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1076     return false;
1077 
1078   LLVM_DEBUG(
1079       dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1080              << DAG.MinOccupancy << ".\n");
1081   return true;
1082 }
1083 
1084 bool PreRARematStage::initGCNSchedStage() {
1085   if (!GCNSchedStage::initGCNSchedStage())
1086     return false;
1087 
1088   if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
1089     return false;
1090 
1091   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
1092   // Rematerialization will not help if occupancy is not limited by reg usage.
1093   if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy)
1094     return false;
1095 
1096   // FIXME: This pass will invalidate cached MBBLiveIns for regions
1097   // in between the defs and the region we sank the def to. Cached pressure
1098   // for regions where a def is sunk from will also be invalidated. This will
1099   // need to be fixed if there is another pass after this pass.
1100   assert(!S.hasNextStage());
1101 
1102   collectRematerializableInstructions();
1103   if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
1104     return false;
1105 
1106   LLVM_DEBUG(
1107       dbgs() << "Retrying function scheduling with improved occupancy of "
1108              << DAG.MinOccupancy << " from rematerializing\n");
1109   return true;
1110 }
1111 
1112 void GCNSchedStage::finalizeGCNSchedStage() {
1113   DAG.finishBlock();
1114   LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1115 }
1116 
1117 void UnclusteredHighRPStage::finalizeGCNSchedStage() {
1118   SavedMutations.swap(DAG.Mutations);
1119   S.SGPRLimitBias = S.VGPRLimitBias = 0;
1120   if (DAG.MinOccupancy > InitialOccupancy) {
1121     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
1122       DAG.RegionsWithMinOcc[IDX] =
1123           DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
1124 
1125     LLVM_DEBUG(dbgs() << StageID
1126                       << " stage successfully increased occupancy to "
1127                       << DAG.MinOccupancy << '\n');
1128   }
1129 
1130   GCNSchedStage::finalizeGCNSchedStage();
1131 }
1132 
1133 bool GCNSchedStage::initGCNRegion() {
1134   // Check whether this new region is also a new block.
1135   if (DAG.RegionBegin->getParent() != CurrentMBB)
1136     setupNewBlock();
1137 
1138   unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1139   DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1140 
1141   // Skip empty scheduling regions (0 or 1 schedulable instructions).
1142   if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1143     return false;
1144 
1145   LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1146   LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1147                     << " " << CurrentMBB->getName()
1148                     << "\n  From: " << *DAG.begin() << "    To: ";
1149              if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1150              else dbgs() << "End";
1151              dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1152 
1153   // Save original instruction order before scheduling for possible revert.
1154   Unsched.clear();
1155   Unsched.reserve(DAG.NumRegionInstrs);
1156   if (StageID == GCNSchedStageID::OccInitialSchedule ||
1157       StageID == GCNSchedStageID::ILPInitialSchedule) {
1158     for (auto &I : DAG) {
1159       Unsched.push_back(&I);
1160       if (isIGLPMutationOnly(I.getOpcode()))
1161         DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1162     }
1163   } else {
1164     for (auto &I : DAG)
1165       Unsched.push_back(&I);
1166   }
1167 
1168   PressureBefore = DAG.Pressure[RegionIdx];
1169 
1170   LLVM_DEBUG(
1171       dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1172              << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1173              << "Region live-in pressure:  "
1174              << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1175              << "Region register pressure: " << print(PressureBefore));
1176 
1177   S.HasHighPressure = false;
1178   S.KnownExcessRP = isRegionWithExcessRP();
1179 
1180   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1181       StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
1182     SavedMutations.clear();
1183     SavedMutations.swap(DAG.Mutations);
1184     bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1185                           StageID == GCNSchedStageID::ILPInitialSchedule;
1186     DAG.addMutation(createIGroupLPDAGMutation(
1187         IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1188                        : AMDGPU::SchedulingPhase::PreRAReentry));
1189   }
1190 
1191   return true;
1192 }
1193 
1194 bool UnclusteredHighRPStage::initGCNRegion() {
1195   // Only reschedule regions with the minimum occupancy or regions that may have
1196   // spilling (excess register pressure).
1197   if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
1198        DAG.MinOccupancy <= InitialOccupancy) &&
1199       !DAG.RegionsWithExcessRP[RegionIdx])
1200     return false;
1201 
1202   return GCNSchedStage::initGCNRegion();
1203 }
1204 
1205 bool ClusteredLowOccStage::initGCNRegion() {
1206   // We may need to reschedule this region if it wasn't rescheduled in the last
1207   // stage, or if we found it was testing critical register pressure limits in
1208   // the unclustered reschedule stage. The latter is because we may not have been
1209   // able to raise the min occupancy in the previous stage, so the region may be
1210   // overly constrained even if it was already rescheduled.
1211   if (!DAG.RegionsWithHighRP[RegionIdx])
1212     return false;
1213 
1214   return GCNSchedStage::initGCNRegion();
1215 }
1216 
1217 bool PreRARematStage::initGCNRegion() {
1218   if (!DAG.RescheduleRegions[RegionIdx])
1219     return false;
1220 
1221   return GCNSchedStage::initGCNRegion();
1222 }
1223 
1224 void GCNSchedStage::setupNewBlock() {
1225   if (CurrentMBB)
1226     DAG.finishBlock();
1227 
1228   CurrentMBB = DAG.RegionBegin->getParent();
1229   DAG.startBlock(CurrentMBB);
1230   // Get real RP for the region if it hasn't been calculated before. After the
1231   // initial schedule stage, real RP will be collected after scheduling.
1232   if (StageID == GCNSchedStageID::OccInitialSchedule ||
1233       StageID == GCNSchedStageID::ILPInitialSchedule ||
1234       StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
1235     DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1236 }
1237 
1238 void GCNSchedStage::finalizeGCNRegion() {
1239   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1240   DAG.RescheduleRegions[RegionIdx] = false;
1241   if (S.HasHighPressure)
1242     DAG.RegionsWithHighRP[RegionIdx] = true;
1243 
1244   // Revert scheduling if we have dropped occupancy or there is some other
1245   // reason that the original schedule is better.
1246   checkScheduling();
1247 
1248   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1249       StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1250     SavedMutations.swap(DAG.Mutations);
1251 
1252   DAG.exitRegion();
1253   RegionIdx++;
1254 }
1255 
1256 void GCNSchedStage::checkScheduling() {
1257   // Check the results of scheduling.
1258   PressureAfter = DAG.getRealRegPressure(RegionIdx);
1259 
1260   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1261   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1262 
1263   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1264       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1265     DAG.Pressure[RegionIdx] = PressureAfter;
1266     DAG.RegionsWithMinOcc[RegionIdx] =
1267         PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1268 
1269     // Early out if we have achieved the occupancy target.
1270     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1271     return;
1272   }
1273 
1274   unsigned TargetOccupancy = std::min(
1275       S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1276   unsigned WavesAfter =
1277       std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
1278   unsigned WavesBefore =
1279       std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
1280   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1281                     << ", after " << WavesAfter << ".\n");
1282 
1283   // We may not be able to keep the current target occupancy because of the
1284   // region we just scheduled. We might still be able to revert scheduling if
1285   // the occupancy before was higher, or if the current schedule has register
1286   // pressure higher than the excess limits, which could lead to more spilling.
1287   unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1288 
1289   // Allow memory bound functions to drop to 4 waves if not limited by an
1290   // attribute.
1291   if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1292       WavesAfter >= MFI.getMinAllowedOccupancy()) {
1293     LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1294                       << MFI.getMinAllowedOccupancy() << " waves\n");
1295     NewOccupancy = WavesAfter;
1296   }
1297 
1298   if (NewOccupancy < DAG.MinOccupancy) {
1299     DAG.MinOccupancy = NewOccupancy;
1300     MFI.limitOccupancy(DAG.MinOccupancy);
1301     DAG.RegionsWithMinOcc.reset();
1302     LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1303                       << DAG.MinOccupancy << ".\n");
1304   }
1305   // The maximum number of arch VGPRs on a non-unified register file, or the
1306   // maximum VGPR + AGPR count in the unified register file case.
1307   unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1308   // The maximum number of arch VGPRs for both the unified and non-unified
1309   // register file cases.
1310   unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1311   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1312 
1313   if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1314       PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
1315       PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1316       PressureAfter.getSGPRNum() > MaxSGPRs) {
1317     DAG.RescheduleRegions[RegionIdx] = true;
1318     DAG.RegionsWithHighRP[RegionIdx] = true;
1319     DAG.RegionsWithExcessRP[RegionIdx] = true;
1320   }
1321 
1322   // Revert if this region's schedule would cause a drop in occupancy or
1323   // spilling.
1324   if (shouldRevertScheduling(WavesAfter)) {
1325     revertScheduling();
1326   } else {
1327     DAG.Pressure[RegionIdx] = PressureAfter;
1328     DAG.RegionsWithMinOcc[RegionIdx] =
1329         PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1330   }
1331 }
1332 
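     // Estimate the earliest cycle at which SU could issue: the maximum, over
     // all register-dependence predecessors, of the predecessor's ready cycle
     // plus its modeled latency, but never earlier than CurrCycle. For example,
     // with two predecessors ready at cycles 3 and 5 and latencies of 4 and 2
     // cycles, the unit is ready at max(CurrCycle, 3 + 4, 5 + 2).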
1333 unsigned
1334 GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1335                                       DenseMap<unsigned, unsigned> &ReadyCycles,
1336                                       const TargetSchedModel &SM) {
1337   unsigned ReadyCycle = CurrCycle;
1338   for (auto &D : SU.Preds) {
1339     if (D.isAssignedRegDep()) {
1340       MachineInstr *DefMI = D.getSUnit()->getInstr();
1341       unsigned Latency = SM.computeInstrLatency(DefMI);
1342       unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1343       ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1344     }
1345   }
1346   ReadyCycles[SU.NodeNum] = ReadyCycle;
1347   return ReadyCycle;
1348 }
1349 
1350 #ifndef NDEBUG
1351 struct EarlierIssuingCycle {
1352   bool operator()(std::pair<MachineInstr *, unsigned> A,
1353                   std::pair<MachineInstr *, unsigned> B) const {
1354     return A.second < B.second;
1355   }
1356 };
1357 
1358 static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1359                                         EarlierIssuingCycle> &ReadyCycles) {
1360   if (ReadyCycles.empty())
1361     return;
1362   unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1363   dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1364          << " ##################\n# Cycle #\t\t\tInstruction          "
1365             "             "
1366             "                            \n";
1367   unsigned IPrev = 1;
1368   for (auto &I : ReadyCycles) {
1369     if (I.second > IPrev + 1)
1370       dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1371              << " CYCLES DETECTED ******************************\n\n";
1372     dbgs() << "[ " << I.second << " ]  :  " << *I.first << "\n";
1373     IPrev = I.second;
1374   }
1375 }
1376 #endif
1377 
1378 ScheduleMetrics
1379 GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1380 #ifndef NDEBUG
1381   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1382       ReadyCyclesSorted;
1383 #endif
1384   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1385   unsigned SumBubbles = 0;
1386   DenseMap<unsigned, unsigned> ReadyCycles;
1387   unsigned CurrCycle = 0;
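       // Walk the saved schedule in order, issuing one instruction per cycle;
       // any gap between CurrCycle and an instruction's ready cycle is counted
       // as bubble (stall) cycles, which feed the latency metric below.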
1388   for (auto &SU : InputSchedule) {
1389     unsigned ReadyCycle =
1390         computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1391     SumBubbles += ReadyCycle - CurrCycle;
1392 #ifndef NDEBUG
1393     ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1394 #endif
1395     CurrCycle = ++ReadyCycle;
1396   }
1397 #ifndef NDEBUG
1398   LLVM_DEBUG(
1399       printScheduleModel(ReadyCyclesSorted);
1400       dbgs() << "\n\t"
1401              << "Metric: "
1402              << (SumBubbles
1403                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1404                      : 1)
1405              << "\n\n");
1406 #endif
1407 
1408   return ScheduleMetrics(CurrCycle, SumBubbles);
1409 }
1410 
1411 ScheduleMetrics
1412 GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1413 #ifndef NDEBUG
1414   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1415       ReadyCyclesSorted;
1416 #endif
1417   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1418   unsigned SumBubbles = 0;
1419   DenseMap<unsigned, unsigned> ReadyCycles;
1420   unsigned CurrCycle = 0;
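       // Same bubble accounting as above, but walking the instructions in
       // their current order within the region rather than a saved SUnit
       // schedule.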
1421   for (auto &MI : DAG) {
1422     SUnit *SU = DAG.getSUnit(&MI);
1423     if (!SU)
1424       continue;
1425     unsigned ReadyCycle =
1426         computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1427     SumBubbles += ReadyCycle - CurrCycle;
1428 #ifndef NDEBUG
1429     ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1430 #endif
1431     CurrCycle = ++ReadyCycle;
1432   }
1433 #ifndef NDEBUG
1434   LLVM_DEBUG(
1435       printScheduleModel(ReadyCyclesSorted);
1436       dbgs() << "\n\t"
1437              << "Metric: "
1438              << (SumBubbles
1439                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1440                      : 1)
1441              << "\n\n");
1442 #endif
1443 
1444   return ScheduleMetrics(CurrCycle, SumBubbles);
1445 }
1446 
1447 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1448   if (WavesAfter < DAG.MinOccupancy)
1449     return true;
1450 
1451   return false;
1452 }
1453 
1454 bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1455   if (PressureAfter == PressureBefore)
1456     return false;
1457 
1458   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1459     return true;
1460 
1461   if (mayCauseSpilling(WavesAfter))
1462     return true;
1463 
1464   return false;
1465 }
1466 
1467 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1468   // If RP is not reduced in the unclustered reschedule stage, revert to the
1469   // old schedule.
1470   if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
1471        mayCauseSpilling(WavesAfter)) ||
1472       GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1473     LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1474     return true;
1475   }
1476 
1477   // Do not attempt to relax the schedule even more if we are already spilling.
1478   if (isRegionWithExcessRP())
1479     return false;
1480 
1481   LLVM_DEBUG(
1482       dbgs()
1483       << "\n\t      *** In shouldRevertScheduling ***\n"
1484       << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
1485   ScheduleMetrics MBefore =
1486       getScheduleMetrics(DAG.SUnits);
1487   LLVM_DEBUG(
1488       dbgs()
1489       << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
1490   ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1491   unsigned OldMetric = MBefore.getMetric();
1492   unsigned NewMetric = MAfter.getMetric();
1493   unsigned WavesBefore =
1494       std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
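       // Keep the new schedule only when the combined occupancy and latency
       // change pays off: roughly Profit = (WavesAfter / WavesBefore) *
       // ((OldMetric + ScheduleMetricBias) / NewMetric) * ScaleFactor, and we
       // revert when Profit < ScaleFactor. For example, dropping from 5 to 4
       // waves is kept only if the new latency metric is at most ~80% of the
       // old metric plus the bias (ignoring integer truncation).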
1495   unsigned Profit =
1496       ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1497        ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1498        NewMetric) /
1499       ScheduleMetrics::ScaleFactor;
1500   LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1501                     << MAfter << "Profit: " << Profit << "\n");
1502   return Profit < ScheduleMetrics::ScaleFactor;
1503 }
1504 
1505 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1506   if (PressureAfter == PressureBefore)
1507     return false;
1508 
1509   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1510     return true;
1511 
1512   if (mayCauseSpilling(WavesAfter))
1513     return true;
1514 
1515   return false;
1516 }
1517 
1518 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1519   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1520     return true;
1521 
1522   if (mayCauseSpilling(WavesAfter))
1523     return true;
1524 
1525   return false;
1526 }
1527 
1528 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1529   if (mayCauseSpilling(WavesAfter))
1530     return true;
1531 
1532   return false;
1533 }
1534 
1535 bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
1536     unsigned WavesAfter) {
1537   return mayCauseSpilling(WavesAfter);
1538 }
1539 
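     // Heuristic for spill risk: the region is flagged when occupancy is
     // already at the minimum waves per EU, register pressure exceeds the
     // excess limits, and the post-scheduling pressure is not strictly lower
     // than before.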
1540 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1541   if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1542       !PressureAfter.less(MF, PressureBefore)) {
1543     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1544     return true;
1545   }
1546 
1547   return false;
1548 }
1549 
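     // Restore the original pre-scheduling instruction order recorded in
     // Unsched, updating LiveIntervals, lane-mask/dead-def flags, and the
     // cached region boundaries as the instructions are moved back into place.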
1550 void GCNSchedStage::revertScheduling() {
1551   DAG.RegionsWithMinOcc[RegionIdx] =
1552       PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
1553   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1554   DAG.RescheduleRegions[RegionIdx] =
1555       S.hasNextStage() &&
1556       S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
1557   DAG.RegionEnd = DAG.RegionBegin;
1558   int SkippedDebugInstr = 0;
1559   for (MachineInstr *MI : Unsched) {
1560     if (MI->isDebugInstr()) {
1561       ++SkippedDebugInstr;
1562       continue;
1563     }
1564 
1565     if (MI->getIterator() != DAG.RegionEnd) {
1566       DAG.BB->remove(MI);
1567       DAG.BB->insert(DAG.RegionEnd, MI);
1568       if (!MI->isDebugInstr())
1569         DAG.LIS->handleMove(*MI, true);
1570     }
1571 
1572     // Reset read-undef flags and update them later.
1573     for (auto &Op : MI->all_defs())
1574       Op.setIsUndef(false);
1575     RegisterOperands RegOpers;
1576     RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1577     if (!MI->isDebugInstr()) {
1578       if (DAG.ShouldTrackLaneMasks) {
1579         // Adjust liveness and add missing dead+read-undef flags.
1580         SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1581         RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1582       } else {
1583         // Adjust for missing dead-def flags.
1584         RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1585       }
1586     }
1587     DAG.RegionEnd = MI->getIterator();
1588     ++DAG.RegionEnd;
1589     LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1590   }
1591 
1592   // After reverting the schedule, debug instrs will now be at the end of the
1593   // block and RegionEnd will point to the first debug instr. Increment
1594   // RegionEnd past the debug instrs to the actual end of the scheduling region.
1595   while (SkippedDebugInstr-- > 0)
1596     ++DAG.RegionEnd;
1597 
1598   // If Unsched.front() is a debug instruction, using its iterator would shrink
1599   // the region, since all debug instructions were moved to the end of the
1600   // block. Find the first instruction that is not a debug instruction.
1601   DAG.RegionBegin = Unsched.front()->getIterator();
1602   if (DAG.RegionBegin->isDebugInstr()) {
1603     for (MachineInstr *MI : Unsched) {
1604       if (MI->isDebugInstr())
1605         continue;
1606       DAG.RegionBegin = MI->getIterator();
1607       break;
1608     }
1609   }
1610 
1611   // Then move the debug instructions back into their correct place and set
1612   // RegionBegin and RegionEnd if needed.
1613   DAG.placeDebugValues();
1614 
1615   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1616 }
1617 
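     // Collect candidate defs for rematerialization: virtual VGPR defs with a
     // single def and a single non-debug use in another block that are
     // trivially rematerializable and live into at least one region currently
     // at MinOccupancy. The live-in regions of each candidate are cached for
     // the RP update in sinkTriviallyRematInsts.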
1618 void PreRARematStage::collectRematerializableInstructions() {
1619   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
1620   for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
1621     Register Reg = Register::index2VirtReg(I);
1622     if (!DAG.LIS->hasInterval(Reg))
1623       continue;
1624 
1625     // TODO: Handle AGPR and SGPR rematerialization
1626     if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
1627         !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
1628       continue;
1629 
1630     MachineOperand *Op = DAG.MRI.getOneDef(Reg);
1631     MachineInstr *Def = Op->getParent();
1632     if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
1633       continue;
1634 
1635     MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
1636     if (Def->getParent() == UseI->getParent())
1637       continue;
1638 
1639     // We are only collecting defs that are defined in another block and are
1640     // live-through or used inside regions at MinOccupancy. This means that the
1641     // register must be in the live-in set for the region.
1642     bool AddedToRematList = false;
1643     for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1644       auto It = DAG.LiveIns[I].find(Reg);
1645       if (It != DAG.LiveIns[I].end() && !It->second.none()) {
1646         if (DAG.RegionsWithMinOcc[I]) {
1647           RematerializableInsts[I][Def] = UseI;
1648           AddedToRematList = true;
1649         }
1650 
1651         // Collect regions with rematerializable reg as live-in to avoid
1652         // searching later when updating RP.
1653         RematDefToLiveInRegions[Def].push_back(I);
1654       }
1655     }
1656     if (!AddedToRematList)
1657       RematDefToLiveInRegions.erase(Def);
1658   }
1659 }
1660 
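     // Try to rematerialize each collected def immediately before its single
     // use, working on copies of the cached regions, live-ins, and pressure so
     // that the whole transformation can be rolled back if it fails to raise
     // occupancy for every region currently at MinOccupancy.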
1661 bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
1662                                               const TargetInstrInfo *TII) {
1663   // Temporary copies of cached variables we will be modifying and replacing if
1664   // sinking succeeds.
1665   SmallVector<
1666       std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
1667       NewRegions;
1668   DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
1669   DenseMap<unsigned, GCNRegPressure> NewPressure;
1670   BitVector NewRescheduleRegions;
1671   LiveIntervals *LIS = DAG.LIS;
1672 
1673   NewRegions.resize(DAG.Regions.size());
1674   NewRescheduleRegions.resize(DAG.Regions.size());
1675 
1676   // Collect only regions that have a rematerializable def as a live-in.
1677   SmallSet<unsigned, 16> ImpactedRegions;
1678   for (const auto &It : RematDefToLiveInRegions)
1679     ImpactedRegions.insert(It.second.begin(), It.second.end());
1680 
1681   // Make copies of register pressure and live-ins cache that will be updated
1682   // as we rematerialize.
1683   for (auto Idx : ImpactedRegions) {
1684     NewPressure[Idx] = DAG.Pressure[Idx];
1685     NewLiveIns[Idx] = DAG.LiveIns[Idx];
1686   }
1687   NewRegions = DAG.Regions;
1688   NewRescheduleRegions.reset();
1689 
1690   DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
1691   bool Improved = false;
1692   for (auto I : ImpactedRegions) {
1693     if (!DAG.RegionsWithMinOcc[I])
1694       continue;
1695 
1696     Improved = false;
1697     int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
1698     int SGPRUsage = NewPressure[I].getSGPRNum();
1699 
1700     // TODO: Handle occupancy drop due to AGPR and SGPR.
1701   // Check if the cause of the occupancy drop is VGPR usage rather than SGPR.
1702     if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
1703       break;
1704 
1705     // The occupancy of this region could have been improved by a previous
1706     // iteration's sinking of defs.
1707     if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
1708       NewRescheduleRegions[I] = true;
1709       Improved = true;
1710       continue;
1711     }
1712 
1713     // First check if we have enough trivially rematerializable instructions to
1714     // improve occupancy. Optimistically assume all instructions we are able to
1715   // sink decrease RP.
1716     int TotalSinkableRegs = 0;
1717     for (const auto &It : RematerializableInsts[I]) {
1718       MachineInstr *Def = It.first;
1719       Register DefReg = Def->getOperand(0).getReg();
1720       TotalSinkableRegs +=
1721           SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
1722     }
1723     int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
1724     unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
1725   // If, even in the most optimistic scenario, we cannot improve occupancy, do
1726   // not attempt to sink any instructions.
1727     if (OptimisticOccupancy <= DAG.MinOccupancy)
1728       break;
1729 
1730     unsigned ImproveOccupancy = 0;
1731     SmallVector<MachineInstr *, 4> SinkedDefs;
1732     for (auto &It : RematerializableInsts[I]) {
1733       MachineInstr *Def = It.first;
1734       MachineBasicBlock::iterator InsertPos =
1735           MachineBasicBlock::iterator(It.second);
1736       Register Reg = Def->getOperand(0).getReg();
1737       // Rematerialize MI to its use block. Since we are only rematerializing
1738       // instructions that do not have any virtual reg uses, we do not need to
1739       // call LiveRangeEdit::allUsesAvailableAt() and
1740       // LiveRangeEdit::canRematerializeAt().
1741       TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
1742                          Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
1743       MachineInstr *NewMI = &*std::prev(InsertPos);
1744       LIS->InsertMachineInstrInMaps(*NewMI);
1745       LIS->removeInterval(Reg);
1746       LIS->createAndComputeVirtRegInterval(Reg);
1747       InsertedMIToOldDef[NewMI] = Def;
1748 
1749       // Update region boundaries in the scheduling region we sank from, since we
1750       // may sink an instruction that was at the beginning or end of its region.
1751       DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
1752                                  /*Removing =*/true);
1753 
1754       // Update region boundaries in the region we sank to.
1755       DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
1756 
1757       LaneBitmask PrevMask = NewLiveIns[I][Reg];
1758       // FIXME: Also update cached pressure for where the def was sunk from.
1759       // Update RP for all regions that have this reg as a live-in and remove
1760       // the reg from all regions as a live-in.
1761       for (auto Idx : RematDefToLiveInRegions[Def]) {
1762         NewLiveIns[Idx].erase(Reg);
1763         if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
1764           // Def is live-through and not used in this block.
1765           NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
1766         } else {
1767           // Def is used and rematerialized into this block.
1768           GCNDownwardRPTracker RPT(*LIS);
1769           auto *NonDbgMI = &*skipDebugInstructionsForward(
1770               NewRegions[Idx].first, NewRegions[Idx].second);
1771           RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
1772           RPT.advance(NewRegions[Idx].second);
1773           NewPressure[Idx] = RPT.moveMaxPressure();
1774         }
1775       }
1776 
1777       SinkedDefs.push_back(Def);
1778       ImproveOccupancy = NewPressure[I].getOccupancy(ST);
1779       if (ImproveOccupancy > DAG.MinOccupancy)
1780         break;
1781     }
1782 
1783     // Remove defs we just sank from all regions' lists of sinkable defs.
1784     for (auto &Def : SinkedDefs)
1785       for (auto TrackedIdx : RematDefToLiveInRegions[Def])
1786         RematerializableInsts[TrackedIdx].erase(Def);
1787 
1788     if (ImproveOccupancy <= DAG.MinOccupancy)
1789       break;
1790 
1791     NewRescheduleRegions[I] = true;
1792     Improved = true;
1793   }
1794 
1795   if (!Improved) {
1796     // Occupancy was not improved for all regions that were at MinOccupancy.
1797     // Undo sinking and remove newly rematerialized instructions.
1798     for (auto &Entry : InsertedMIToOldDef) {
1799       MachineInstr *MI = Entry.first;
1800       MachineInstr *OldMI = Entry.second;
1801       Register Reg = MI->getOperand(0).getReg();
1802       LIS->RemoveMachineInstrFromMaps(*MI);
1803       MI->eraseFromParent();
1804       OldMI->clearRegisterDeads(Reg);
1805       LIS->removeInterval(Reg);
1806       LIS->createAndComputeVirtRegInterval(Reg);
1807     }
1808     return false;
1809   }
1810 
1811   // Occupancy was improved for all regions.
1812   for (auto &Entry : InsertedMIToOldDef) {
1813     MachineInstr *MI = Entry.first;
1814     MachineInstr *OldMI = Entry.second;
1815 
1816     // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
1817     DAG.BBLiveInMap.erase(OldMI);
1818 
1819     // Remove OldMI and update LIS
1820     Register Reg = MI->getOperand(0).getReg();
1821     LIS->RemoveMachineInstrFromMaps(*OldMI);
1822     OldMI->eraseFromParent();
1823     LIS->removeInterval(Reg);
1824     LIS->createAndComputeVirtRegInterval(Reg);
1825   }
1826 
1827   // Update live-ins, register pressure, and regions caches.
1828   for (auto Idx : ImpactedRegions) {
1829     DAG.LiveIns[Idx] = NewLiveIns[Idx];
1830     DAG.Pressure[Idx] = NewPressure[Idx];
1831     DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
1832   }
1833   DAG.Regions = NewRegions;
1834   DAG.RescheduleRegions = NewRescheduleRegions;
1835 
1836   if (GCNTrackers)
1837     DAG.RegionLiveOuts.buildLiveRegMap();
1838 
1839   SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
1840   MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1841 
1842   return true;
1843 }
1844 
1845 bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
1846   if (!DAG.TII->isTriviallyReMaterializable(MI))
1847     return false;
1848 
1849   for (const MachineOperand &MO : MI.all_uses()) {
1850     if (MO.getReg().isVirtual())
1851       return false;
1852 
1853     // We can't remat physreg uses, unless it is a constant or an ignorable
1854     // use (e.g. implicit exec use on VALU instructions)
1855     if (MO.getReg().isPhysical()) {
1856       if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
1857         continue;
1858       return false;
1859     }
1860   }
1861 
1862   return true;
1863 }
1864 
1865 // When removing, we will have to check both the beginning and the end of the
1866 // region. When inserting, we only have to check whether we are inserting NewMI
1867 // in front of a scheduling region; we do not need to check the end, since we
1868 // will only ever be inserting before an already existing MI.
1869 void GCNScheduleDAGMILive::updateRegionBoundaries(
1870     SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
1871                               MachineBasicBlock::iterator>> &RegionBoundaries,
1872     MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
1873   unsigned I = 0, E = RegionBoundaries.size();
1874   // Search for the first region of the block where MI is located.
1875   while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
1876     ++I;
1877 
1878   for (; I != E; ++I) {
1879     if (MI->getParent() != RegionBoundaries[I].first->getParent())
1880       return;
1881 
1882     if (Removing && MI == RegionBoundaries[I].first &&
1883         MI == RegionBoundaries[I].second) {
1884       // MI is in a region of size 1; after removing it, the region will have
1885       // size 0. Set RegionBegin and RegionEnd to the block's end iterator.
1886       RegionBoundaries[I] =
1887           std::pair(MI->getParent()->end(), MI->getParent()->end());
1888       return;
1889     }
1890     if (MI == RegionBoundaries[I].first) {
1891       if (Removing)
1892         RegionBoundaries[I] =
1893             std::pair(std::next(MI), RegionBoundaries[I].second);
1894       else
1895         // Inserted NewMI in front of the region; set the new RegionBegin to NewMI.
1896         RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
1897                                         RegionBoundaries[I].second);
1898       return;
1899     }
1900     if (Removing && MI == RegionBoundaries[I].second) {
1901       RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
1902       return;
1903     }
1904   }
1905 }
1906 
1907 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
1908   return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
1909     return isIGLPMutationOnly(MI->getOpcode());
1910   });
1911 }
1912 
1913 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
1914     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
1915     bool RemoveKillFlags)
1916     : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
1917 
1918 void GCNPostScheduleDAGMILive::schedule() {
1919   HasIGLPInstrs = hasIGLPInstrs(this);
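       // Temporarily swap in the IGroupLP mutation so the IGLP scheduling
       // hints present in this block are honored during the post-RA pass;
       // finalizeSchedule restores the original mutations.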
1920   if (HasIGLPInstrs) {
1921     SavedMutations.clear();
1922     SavedMutations.swap(Mutations);
1923     addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
1924   }
1925 
1926   ScheduleDAGMI::schedule();
1927 }
1928 
1929 void GCNPostScheduleDAGMILive::finalizeSchedule() {
1930   if (HasIGLPInstrs)
1931     SavedMutations.swap(Mutations);
1932 
1933   ScheduleDAGMI::finalizeSchedule();
1934 }
1935