xref: /llvm-project/llvm/lib/CodeGen/MachineScheduler.cpp (revision c3c6d47c45bcd28b45e26d8bf14597ba1e176c05)
1 //===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // MachineScheduler schedules machine instructions after phi elimination. It
10 // preserves LiveIntervals so it can be invoked before register allocation.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/CodeGen/MachineScheduler.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/BitVector.h"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/PriorityQueue.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/ADT/iterator_range.h"
23 #include "llvm/Analysis/AliasAnalysis.h"
24 #include "llvm/CodeGen/LiveInterval.h"
25 #include "llvm/CodeGen/LiveIntervals.h"
26 #include "llvm/CodeGen/MachineBasicBlock.h"
27 #include "llvm/CodeGen/MachineDominators.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineFunctionPass.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineLoopInfo.h"
32 #include "llvm/CodeGen/MachineOperand.h"
33 #include "llvm/CodeGen/MachinePassRegistry.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/CodeGen/RegisterClassInfo.h"
36 #include "llvm/CodeGen/RegisterPressure.h"
37 #include "llvm/CodeGen/ScheduleDAG.h"
38 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
39 #include "llvm/CodeGen/ScheduleDAGMutation.h"
40 #include "llvm/CodeGen/ScheduleDFS.h"
41 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
42 #include "llvm/CodeGen/SlotIndexes.h"
43 #include "llvm/CodeGen/TargetFrameLowering.h"
44 #include "llvm/CodeGen/TargetInstrInfo.h"
45 #include "llvm/CodeGen/TargetLowering.h"
46 #include "llvm/CodeGen/TargetPassConfig.h"
47 #include "llvm/CodeGen/TargetRegisterInfo.h"
48 #include "llvm/CodeGen/TargetSchedule.h"
49 #include "llvm/CodeGen/TargetSubtargetInfo.h"
50 #include "llvm/Config/llvm-config.h"
51 #include "llvm/InitializePasses.h"
52 #include "llvm/MC/LaneBitmask.h"
53 #include "llvm/Pass.h"
54 #include "llvm/Support/CommandLine.h"
55 #include "llvm/Support/Compiler.h"
56 #include "llvm/Support/Debug.h"
57 #include "llvm/Support/ErrorHandling.h"
58 #include "llvm/Support/GraphWriter.h"
59 #include "llvm/Support/MachineValueType.h"
60 #include "llvm/Support/raw_ostream.h"
61 #include <algorithm>
62 #include <cassert>
63 #include <cstdint>
64 #include <iterator>
65 #include <limits>
66 #include <memory>
67 #include <string>
68 #include <tuple>
69 #include <utility>
70 #include <vector>
71 
72 using namespace llvm;
73 
74 #define DEBUG_TYPE "machine-scheduler"
75 
76 STATISTIC(NumClustered, "Number of load/store pairs clustered");
77 
78 namespace llvm {
79 
80 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
81                            cl::desc("Force top-down list scheduling"));
82 cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
83                             cl::desc("Force bottom-up list scheduling"));
84 cl::opt<bool>
85 DumpCriticalPathLength("misched-dcpl", cl::Hidden,
86                        cl::desc("Print critical path length to stdout"));
87 
88 cl::opt<bool> VerifyScheduling(
89     "verify-misched", cl::Hidden,
90     cl::desc("Verify machine instrs before and after machine scheduling"));
91 
92 #ifndef NDEBUG
93 cl::opt<bool> ViewMISchedDAGs(
94     "view-misched-dags", cl::Hidden,
95     cl::desc("Pop up a window to show MISched dags after they are processed"));
96 cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
97                         cl::desc("Print schedule DAGs"));
98 cl::opt<bool> MISchedDumpReservedCycles(
99     "misched-dump-reserved-cycles", cl::Hidden, cl::init(false),
100     cl::desc("Dump resource usage at schedule boundary."));
101 #else
102 const bool ViewMISchedDAGs = false;
103 const bool PrintDAGs = false;
104 const bool MISchedDumpReservedCycles = false;
105 #endif // NDEBUG
106 
107 } // end namespace llvm
108 
109 #ifndef NDEBUG
110 /// In some situations a few uninteresting nodes depend on nearly all other
111 /// nodes in the graph; provide a cutoff to hide them.
112 static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden,
113   cl::desc("Hide nodes with more predecessors/successors than the cutoff"));
114 
115 static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
116   cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
117 
118 static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden,
119   cl::desc("Only schedule this function"));
120 static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,
121                                         cl::desc("Only schedule this MBB#"));
122 #endif // NDEBUG
123 
124 /// Avoid quadratic complexity in unusually large basic blocks by limiting the
125 /// size of the ready lists.
126 static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden,
127   cl::desc("Limit ready list to N instructions"), cl::init(256));
128 
129 static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
130   cl::desc("Enable register pressure scheduling."), cl::init(true));
131 
132 static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
133   cl::desc("Enable cyclic critical path analysis."), cl::init(true));
134 
135 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
136                                         cl::desc("Enable memop clustering."),
137                                         cl::init(true));
138 static cl::opt<bool>
139     ForceFastCluster("force-fast-cluster", cl::Hidden,
140                      cl::desc("Switch to the fast cluster algorithm, at the "
141                               "cost of losing some fusion opportunities"),
142                      cl::init(false));
143 static cl::opt<unsigned>
144     FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
145                          cl::desc("The threshold for fast cluster"),
146                          cl::init(1000));
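
// A usage sketch, not part of this pass's logic: the options above are plain
// cl::opt flags, so (assuming a tool that links this library) they can be
// passed to llc directly or to clang via -mllvm, for example:
//
//   llc -verify-misched -misched-limit=128 foo.ll
//   clang -O2 -mllvm -misched-cluster=false foo.c
//
// Flags declared under #ifndef NDEBUG above (e.g. -misched-cutoff,
// -misched-only-func) are only available in builds with assertions enabled.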
147 
148 // DAG subtrees must have at least this many nodes.
149 static const unsigned MinSubtreeSize = 8;
150 
151 // Pin the vtables to this file.
152 void MachineSchedStrategy::anchor() {}
153 
154 void ScheduleDAGMutation::anchor() {}
155 
156 //===----------------------------------------------------------------------===//
157 // Machine Instruction Scheduling Pass and Registry
158 //===----------------------------------------------------------------------===//
159 
160 MachineSchedContext::MachineSchedContext() {
161   RegClassInfo = new RegisterClassInfo();
162 }
163 
164 MachineSchedContext::~MachineSchedContext() {
165   delete RegClassInfo;
166 }
167 
168 namespace {
169 
170 /// Base class for a machine scheduler pass that can run at any point.
171 class MachineSchedulerBase : public MachineSchedContext,
172                              public MachineFunctionPass {
173 public:
174   MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {}
175 
176   void print(raw_ostream &O, const Module* = nullptr) const override;
177 
178 protected:
179   void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags);
180 };
181 
182 /// MachineScheduler runs after coalescing and before register allocation.
183 class MachineScheduler : public MachineSchedulerBase {
184 public:
185   MachineScheduler();
186 
187   void getAnalysisUsage(AnalysisUsage &AU) const override;
188 
189   bool runOnMachineFunction(MachineFunction&) override;
190 
191   static char ID; // Class identification, replacement for typeinfo
192 
193 protected:
194   ScheduleDAGInstrs *createMachineScheduler();
195 };
196 
197 /// PostMachineScheduler runs after register allocation, shortly before code emission.
198 class PostMachineScheduler : public MachineSchedulerBase {
199 public:
200   PostMachineScheduler();
201 
202   void getAnalysisUsage(AnalysisUsage &AU) const override;
203 
204   bool runOnMachineFunction(MachineFunction&) override;
205 
206   static char ID; // Class identification, replacement for typeinfo
207 
208 protected:
209   ScheduleDAGInstrs *createPostMachineScheduler();
210 };
211 
212 } // end anonymous namespace
213 
214 char MachineScheduler::ID = 0;
215 
216 char &llvm::MachineSchedulerID = MachineScheduler::ID;
217 
218 INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE,
219                       "Machine Instruction Scheduler", false, false)
220 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
221 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
222 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
223 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
224 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
225 INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE,
226                     "Machine Instruction Scheduler", false, false)
227 
228 MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) {
229   initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
230 }
231 
232 void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
233   AU.setPreservesCFG();
234   AU.addRequired<MachineDominatorTree>();
235   AU.addRequired<MachineLoopInfo>();
236   AU.addRequired<AAResultsWrapperPass>();
237   AU.addRequired<TargetPassConfig>();
238   AU.addRequired<SlotIndexes>();
239   AU.addPreserved<SlotIndexes>();
240   AU.addRequired<LiveIntervals>();
241   AU.addPreserved<LiveIntervals>();
242   MachineFunctionPass::getAnalysisUsage(AU);
243 }
244 
245 char PostMachineScheduler::ID = 0;
246 
247 char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID;
248 
249 INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched",
250                       "PostRA Machine Instruction Scheduler", false, false)
251 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
252 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
253 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
254 INITIALIZE_PASS_END(PostMachineScheduler, "postmisched",
255                     "PostRA Machine Instruction Scheduler", false, false)
256 
257 PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) {
258   initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry());
259 }
260 
261 void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
262   AU.setPreservesCFG();
263   AU.addRequired<MachineDominatorTree>();
264   AU.addRequired<MachineLoopInfo>();
265   AU.addRequired<AAResultsWrapperPass>();
266   AU.addRequired<TargetPassConfig>();
267   MachineFunctionPass::getAnalysisUsage(AU);
268 }
269 
270 MachinePassRegistry<MachineSchedRegistry::ScheduleDAGCtor>
271     MachineSchedRegistry::Registry;
272 
273 /// A dummy default scheduler factory indicates whether the scheduler
274 /// is overridden on the command line.
275 static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
276   return nullptr;
277 }
278 
279 /// MachineSchedOpt allows command line selection of the scheduler.
280 static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
281                RegisterPassParser<MachineSchedRegistry>>
282 MachineSchedOpt("misched",
283                 cl::init(&useDefaultMachineSched), cl::Hidden,
284                 cl::desc("Machine instruction scheduler to use"));
285 
286 static MachineSchedRegistry
287 DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
288                      useDefaultMachineSched);
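
// Illustrative sketch (hypothetical names): a target or plugin can make its own
// scheduler selectable with -misched=<name> by registering a factory in the
// same way, e.g.:
//
//   static ScheduleDAGInstrs *createMyCustomSched(MachineSchedContext *C) {
//     return new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
//   }
//   static MachineSchedRegistry
//       MyCustomSchedRegistry("my-custom", "Hypothetical example scheduler.",
//                             createMyCustomSched);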
289 
290 static cl::opt<bool> EnableMachineSched(
291     "enable-misched",
292     cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
293     cl::Hidden);
294 
295 static cl::opt<bool> EnablePostRAMachineSched(
296     "enable-post-misched",
297     cl::desc("Enable the post-ra machine instruction scheduling pass."),
298     cl::init(true), cl::Hidden);
299 
300 /// Decrement this iterator until reaching the top or a non-debug instr.
301 static MachineBasicBlock::const_iterator
302 priorNonDebug(MachineBasicBlock::const_iterator I,
303               MachineBasicBlock::const_iterator Beg) {
304   assert(I != Beg && "reached the top of the region, cannot decrement");
305   while (--I != Beg) {
306     if (!I->isDebugOrPseudoInstr())
307       break;
308   }
309   return I;
310 }
311 
312 /// Non-const version.
313 static MachineBasicBlock::iterator
314 priorNonDebug(MachineBasicBlock::iterator I,
315               MachineBasicBlock::const_iterator Beg) {
316   return priorNonDebug(MachineBasicBlock::const_iterator(I), Beg)
317       .getNonConstIterator();
318 }
319 
320 /// If this iterator is a debug value, increment until reaching the End or a
321 /// non-debug instruction.
322 static MachineBasicBlock::const_iterator
323 nextIfDebug(MachineBasicBlock::const_iterator I,
324             MachineBasicBlock::const_iterator End) {
325   for (; I != End; ++I) {
326     if (!I->isDebugOrPseudoInstr())
327       break;
328   }
329   return I;
330 }
331 
332 /// Non-const version.
333 static MachineBasicBlock::iterator
334 nextIfDebug(MachineBasicBlock::iterator I,
335             MachineBasicBlock::const_iterator End) {
336   return nextIfDebug(MachineBasicBlock::const_iterator(I), End)
337       .getNonConstIterator();
338 }
339 
340 /// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
341 ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
342   // Select the scheduler, or set the default.
343   MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
344   if (Ctor != useDefaultMachineSched)
345     return Ctor(this);
346 
347   // Get the default scheduler set by the target for this function.
348   ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
349   if (Scheduler)
350     return Scheduler;
351 
352   // Default to GenericScheduler.
353   return createGenericSchedLive(this);
354 }
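
// A minimal sketch of how the PassConfig hook consulted above is typically
// overridden (the target name is hypothetical; several in-tree targets follow
// this pattern):
//
//   ScheduleDAGInstrs *
//   MyTargetPassConfig::createMachineScheduler(MachineSchedContext *C) const {
//     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
//     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
//     return DAG;
//   }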
355 
356 /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
357 /// the caller. We don't have a command line option to override the postRA
358 /// scheduler. The Target must configure it.
359 ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
360   // Get the postRA scheduler set by the target for this function.
361   ScheduleDAGInstrs *Scheduler = PassConfig->createPostMachineScheduler(this);
362   if (Scheduler)
363     return Scheduler;
364 
365   // Default to GenericScheduler.
366   return createGenericSchedPostRA(this);
367 }
368 
369 /// Top-level MachineScheduler pass driver.
370 ///
371 /// Visit blocks in function order. Divide each block into scheduling regions
372 /// and visit them bottom-up. Visiting regions bottom-up is not required, but is
373 /// consistent with the DAG builder, which traverses the interior of the
374 /// scheduling regions bottom-up.
375 ///
376 /// This design avoids exposing scheduling boundaries to the DAG builder,
377 /// simplifying the DAG builder's support for "special" target instructions.
378 /// At the same time the design allows target schedulers to operate across
379 /// scheduling boundaries, for example to bundle the boundary instructions
380 /// without reordering them. This creates complexity, because the target
381 /// scheduler must update the RegionBegin and RegionEnd positions cached by
382 /// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
383 /// design would be to split blocks at scheduling boundaries, but LLVM has a
384 /// general bias against block splitting purely for implementation simplicity.
385 bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
386   if (skipFunction(mf.getFunction()))
387     return false;
388 
389   if (EnableMachineSched.getNumOccurrences()) {
390     if (!EnableMachineSched)
391       return false;
392   } else if (!mf.getSubtarget().enableMachineScheduler())
393     return false;
394 
395   LLVM_DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
396 
397   // Initialize the context of the pass.
398   MF = &mf;
399   MLI = &getAnalysis<MachineLoopInfo>();
400   MDT = &getAnalysis<MachineDominatorTree>();
401   PassConfig = &getAnalysis<TargetPassConfig>();
402   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
403 
404   LIS = &getAnalysis<LiveIntervals>();
405 
406   if (VerifyScheduling) {
407     LLVM_DEBUG(LIS->dump());
408     MF->verify(this, "Before machine scheduling.");
409   }
410   RegClassInfo->runOnMachineFunction(*MF);
411 
412   // Instantiate the selected scheduler for this target, function, and
413   // optimization level.
414   std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
415   scheduleRegions(*Scheduler, false);
416 
417   LLVM_DEBUG(LIS->dump());
418   if (VerifyScheduling)
419     MF->verify(this, "After machine scheduling.");
420   return true;
421 }
422 
423 bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
424   if (skipFunction(mf.getFunction()))
425     return false;
426 
427   if (EnablePostRAMachineSched.getNumOccurrences()) {
428     if (!EnablePostRAMachineSched)
429       return false;
430   } else if (!mf.getSubtarget().enablePostRAMachineScheduler()) {
431     LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
432     return false;
433   }
434   LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
435 
436   // Initialize the context of the pass.
437   MF = &mf;
438   MLI = &getAnalysis<MachineLoopInfo>();
439   PassConfig = &getAnalysis<TargetPassConfig>();
440   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
441 
442   if (VerifyScheduling)
443     MF->verify(this, "Before post machine scheduling.");
444 
445   // Instantiate the selected scheduler for this target, function, and
446   // optimization level.
447   std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler());
448   scheduleRegions(*Scheduler, true);
449 
450   if (VerifyScheduling)
451     MF->verify(this, "After post machine scheduling.");
452   return true;
453 }
454 
455 /// Return true if the given instruction should not be included in a scheduling
456 /// region.
457 ///
458 /// MachineScheduler does not currently support scheduling across calls. To
459 /// handle calls, the DAG builder needs to be modified to create register
460 /// anti/output dependencies on the registers clobbered by the call's regmask
461 /// operand. In PreRA scheduling, the stack pointer adjustment already prevents
462 /// scheduling across calls. In PostRA scheduling, we need isCall to enforce
463 /// the boundary, but there would be no benefit to postRA scheduling across
464 /// calls this late anyway.
465 static bool isSchedBoundary(MachineBasicBlock::iterator MI,
466                             MachineBasicBlock *MBB,
467                             MachineFunction *MF,
468                             const TargetInstrInfo *TII) {
469   return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF);
470 }
471 
472 /// A region of an MBB for scheduling.
473 namespace {
474 struct SchedRegion {
475   /// RegionBegin is the first instruction in the scheduling region, and
476   /// RegionEnd is either MBB->end() or the scheduling boundary after the
477   /// last instruction in the scheduling region. These iterators cannot refer
478   /// to instructions outside of the identified scheduling region because
479   /// those may be reordered before scheduling this region.
480   MachineBasicBlock::iterator RegionBegin;
481   MachineBasicBlock::iterator RegionEnd;
482   unsigned NumRegionInstrs;
483 
484   SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E,
485               unsigned N) :
486     RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {}
487 };
488 } // end anonymous namespace
489 
490 using MBBRegionsVector = SmallVector<SchedRegion, 16>;
491 
492 static void
493 getSchedRegions(MachineBasicBlock *MBB,
494                 MBBRegionsVector &Regions,
495                 bool RegionsTopDown) {
496   MachineFunction *MF = MBB->getParent();
497   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
498 
499   MachineBasicBlock::iterator I = nullptr;
500   for (MachineBasicBlock::iterator RegionEnd = MBB->end();
501       RegionEnd != MBB->begin(); RegionEnd = I) {
502 
503     // Avoid decrementing RegionEnd for blocks with no terminator.
504     if (RegionEnd != MBB->end() ||
505         isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) {
506       --RegionEnd;
507     }
508 
509     // The next region starts above the previous region. Look backward in the
510     // instruction stream until we find the nearest boundary.
511     unsigned NumRegionInstrs = 0;
512     I = RegionEnd;
513     for (; I != MBB->begin(); --I) {
514       MachineInstr &MI = *std::prev(I);
515       if (isSchedBoundary(&MI, &*MBB, MF, TII))
516         break;
517       if (!MI.isDebugOrPseudoInstr()) {
518         // MBB::size() uses instr_iterator to count. Here we need a bundle to
519         // count as a single instruction.
520         ++NumRegionInstrs;
521       }
522     }
523 
524     // It's possible we found a scheduling region that only has debug
525     // instructions. Don't bother scheduling these.
526     if (NumRegionInstrs != 0)
527       Regions.push_back(SchedRegion(I, RegionEnd, NumRegionInstrs));
528   }
529 
530   if (RegionsTopDown)
531     std::reverse(Regions.begin(), Regions.end());
532 }
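
// For intuition, an illustrative block (pseudo-MIR, hypothetical values):
//
//   %1 = ...
//   %2 = ...
//   CALL @f
//   %3 = ...
//   %4 = ...
//   <terminator>
//
// isSchedBoundary() fires on the CALL and on the terminator, so getSchedRegions
// records two regions, roughly [%1, CALL) and [%3, <terminator>), discovered
// bottom-up; boundary instructions themselves are never part of a region's DAG.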
533 
534 /// Main driver for both MachineScheduler and PostMachineScheduler.
535 void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
536                                            bool FixKillFlags) {
537   // Visit all machine basic blocks.
538   //
539   // TODO: Visit blocks in global postorder or postorder within the bottom-up
540   // loop tree. Then we can optionally compute global RegPressure.
541   for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
542        MBB != MBBEnd; ++MBB) {
543 
544     Scheduler.startBlock(&*MBB);
545 
546 #ifndef NDEBUG
547     if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName())
548       continue;
549     if (SchedOnlyBlock.getNumOccurrences()
550         && (int)SchedOnlyBlock != MBB->getNumber())
551       continue;
552 #endif
553 
554     // Break the block into scheduling regions [I, RegionEnd). RegionEnd
555     // points to the scheduling boundary at the bottom of the region. The DAG
556     // does not include RegionEnd, but the region does (i.e. the next
557     // RegionEnd is above the previous RegionBegin). If the current block has
558     // no terminator then RegionEnd == MBB->end() for the bottom region.
559     //
560     // All the regions of MBB are first found and stored in MBBRegions, which
561     // are then processed top-down if doMBBSchedRegionsTopDown() returns true.
562     //
563     // The Scheduler may insert instructions during either schedule() or
564     // exitRegion(), even for empty regions. So the local iterators 'I' and
565     // 'RegionEnd' are invalid across these calls. Instructions must not be
566     // added to regions other than the current one without updating MBBRegions.
567 
568     MBBRegionsVector MBBRegions;
569     getSchedRegions(&*MBB, MBBRegions, Scheduler.doMBBSchedRegionsTopDown());
570     for (const SchedRegion &R : MBBRegions) {
571       MachineBasicBlock::iterator I = R.RegionBegin;
572       MachineBasicBlock::iterator RegionEnd = R.RegionEnd;
573       unsigned NumRegionInstrs = R.NumRegionInstrs;
574 
575       // Notify the scheduler of the region, even if we may skip scheduling
576       // it. Perhaps it still needs to be bundled.
577       Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
578 
579       // Skip empty scheduling regions (0 or 1 schedulable instructions).
580       if (I == RegionEnd || I == std::prev(RegionEnd)) {
581         // Close the current region. Bundle the terminator if needed.
582         // This invalidates 'RegionEnd' and 'I'.
583         Scheduler.exitRegion();
584         continue;
585       }
586       LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
587       LLVM_DEBUG(dbgs() << MF->getName() << ":" << printMBBReference(*MBB)
588                         << " " << MBB->getName() << "\n  From: " << *I
589                         << "    To: ";
590                  if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
591                  else dbgs() << "End\n";
592                  dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
593       if (DumpCriticalPathLength) {
594         errs() << MF->getName();
595         errs() << ":%bb. " << MBB->getNumber();
596         errs() << " " << MBB->getName() << " \n";
597       }
598 
599       // Schedule a region: possibly reorder instructions.
600       // This invalidates the original region iterators.
601       Scheduler.schedule();
602 
603       // Close the current region.
604       Scheduler.exitRegion();
605     }
606     Scheduler.finishBlock();
607     // FIXME: Ideally, no further passes should rely on kill flags. However,
608     // thumb2 size reduction is currently an exception, so the PostMIScheduler
609     // needs to do this.
610     if (FixKillFlags)
611       Scheduler.fixupKills(*MBB);
612   }
613   Scheduler.finalizeSchedule();
614 }
615 
616 void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
617   // unimplemented
618 }
619 
620 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
621 LLVM_DUMP_METHOD void ReadyQueue::dump() const {
622   dbgs() << "Queue " << Name << ": ";
623   for (const SUnit *SU : Queue)
624     dbgs() << SU->NodeNum << " ";
625   dbgs() << "\n";
626 }
627 #endif
628 
629 //===----------------------------------------------------------------------===//
630 // ScheduleDAGMI - Basic machine instruction scheduling. This is
631 // independent of PreRA/PostRA scheduling and involves no extra book-keeping for
632 // virtual registers.
633 //===----------------------------------------------------------------------===//
634 
635 // Provide a vtable anchor.
636 ScheduleDAGMI::~ScheduleDAGMI() = default;
637 
638 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
639 /// NumPredsLeft reaches zero, release the successor node.
640 ///
641 /// FIXME: Adjust SuccSU height based on MinLatency.
642 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
643   SUnit *SuccSU = SuccEdge->getSUnit();
644 
645   if (SuccEdge->isWeak()) {
646     --SuccSU->WeakPredsLeft;
647     if (SuccEdge->isCluster())
648       NextClusterSucc = SuccSU;
649     return;
650   }
651 #ifndef NDEBUG
652   if (SuccSU->NumPredsLeft == 0) {
653     dbgs() << "*** Scheduling failed! ***\n";
654     dumpNode(*SuccSU);
655     dbgs() << " has been released too many times!\n";
656     llvm_unreachable(nullptr);
657   }
658 #endif
659   // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However,
660   // CurrCycle may have advanced since then.
661   if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency())
662     SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency();
663 
664   --SuccSU->NumPredsLeft;
665   if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
666     SchedImpl->releaseTopNode(SuccSU);
667 }
668 
669 /// releaseSuccessors - Call releaseSucc on each of SU's successors.
670 void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
671   for (SDep &Succ : SU->Succs)
672     releaseSucc(SU, &Succ);
673 }
674 
675 /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
676 /// NumSuccsLeft reaches zero, release the predecessor node.
677 ///
678 /// FIXME: Adjust PredSU height based on MinLatency.
679 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
680   SUnit *PredSU = PredEdge->getSUnit();
681 
682   if (PredEdge->isWeak()) {
683     --PredSU->WeakSuccsLeft;
684     if (PredEdge->isCluster())
685       NextClusterPred = PredSU;
686     return;
687   }
688 #ifndef NDEBUG
689   if (PredSU->NumSuccsLeft == 0) {
690     dbgs() << "*** Scheduling failed! ***\n";
691     dumpNode(*PredSU);
692     dbgs() << " has been released too many times!\n";
693     llvm_unreachable(nullptr);
694   }
695 #endif
696   // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However,
697   // CurrCycle may have advanced since then.
698   if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency())
699     PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency();
700 
701   --PredSU->NumSuccsLeft;
702   if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
703     SchedImpl->releaseBottomNode(PredSU);
704 }
705 
706 /// releasePredecessors - Call releasePred on each of SU's predecessors.
707 void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
708   for (SDep &Pred : SU->Preds)
709     releasePred(SU, &Pred);
710 }
711 
712 void ScheduleDAGMI::startBlock(MachineBasicBlock *bb) {
713   ScheduleDAGInstrs::startBlock(bb);
714   SchedImpl->enterMBB(bb);
715 }
716 
717 void ScheduleDAGMI::finishBlock() {
718   SchedImpl->leaveMBB();
719   ScheduleDAGInstrs::finishBlock();
720 }
721 
722 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
723 /// crossing a scheduling boundary. [begin, end) includes all instructions in
724 /// the region, including the boundary itself and single-instruction regions
725 /// that don't get scheduled.
726 void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
727                                      MachineBasicBlock::iterator begin,
728                                      MachineBasicBlock::iterator end,
729                                      unsigned regioninstrs)
730 {
731   ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
732 
733   SchedImpl->initPolicy(begin, end, regioninstrs);
734 }
735 
736 /// This is normally called from the main scheduler loop but may also be invoked
737 /// by the scheduling strategy to perform additional code motion.
738 void ScheduleDAGMI::moveInstruction(
739   MachineInstr *MI, MachineBasicBlock::iterator InsertPos) {
740   // Advance RegionBegin if the first instruction moves down.
741   if (&*RegionBegin == MI)
742     ++RegionBegin;
743 
744   // Update the instruction stream.
745   BB->splice(InsertPos, BB, MI);
746 
747   // Update LiveIntervals
748   if (LIS)
749     LIS->handleMove(*MI, /*UpdateFlags=*/true);
750 
751   // Recede RegionBegin if an instruction moves above the first.
752   if (RegionBegin == InsertPos)
753     RegionBegin = MI;
754 }
755 
756 bool ScheduleDAGMI::checkSchedLimit() {
757 #if LLVM_ENABLE_ABI_BREAKING_CHECKS && !defined(NDEBUG)
758   if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
759     CurrentTop = CurrentBottom;
760     return false;
761   }
762   ++NumInstrsScheduled;
763 #endif
764   return true;
765 }
766 
767 /// Per-region scheduling driver, called back from
768 /// MachineScheduler::runOnMachineFunction. This is a simplified driver that
769 /// does not consider liveness or register pressure. It is useful for PostRA
770 /// scheduling and potentially other custom schedulers.
771 void ScheduleDAGMI::schedule() {
772   LLVM_DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n");
773   LLVM_DEBUG(SchedImpl->dumpPolicy());
774 
775   // Build the DAG.
776   buildSchedGraph(AA);
777 
778   postprocessDAG();
779 
780   SmallVector<SUnit*, 8> TopRoots, BotRoots;
781   findRootsAndBiasEdges(TopRoots, BotRoots);
782 
783   LLVM_DEBUG(dump());
784   if (PrintDAGs) dump();
785   if (ViewMISchedDAGs) viewGraph();
786 
787   // Initialize the strategy before modifying the DAG.
788   // This may initialize a DFSResult to be used for queue priority.
789   SchedImpl->initialize(this);
790 
791   // Initialize ready queues now that the DAG and priority data are finalized.
792   initQueues(TopRoots, BotRoots);
793 
794   bool IsTopNode = false;
795   while (true) {
796     LLVM_DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n");
797     SUnit *SU = SchedImpl->pickNode(IsTopNode);
798     if (!SU) break;
799 
800     assert(!SU->isScheduled && "Node already scheduled");
801     if (!checkSchedLimit())
802       break;
803 
804     MachineInstr *MI = SU->getInstr();
805     if (IsTopNode) {
806       assert(SU->isTopReady() && "node still has unscheduled dependencies");
807       if (&*CurrentTop == MI)
808         CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
809       else
810         moveInstruction(MI, CurrentTop);
811     } else {
812       assert(SU->isBottomReady() && "node still has unscheduled dependencies");
813       MachineBasicBlock::iterator priorII =
814         priorNonDebug(CurrentBottom, CurrentTop);
815       if (&*priorII == MI)
816         CurrentBottom = priorII;
817       else {
818         if (&*CurrentTop == MI)
819           CurrentTop = nextIfDebug(++CurrentTop, priorII);
820         moveInstruction(MI, CurrentBottom);
821         CurrentBottom = MI;
822       }
823     }
824     // Notify the scheduling strategy before updating the DAG.
825     // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues
826     // runs, it can then use the accurate ReadyCycle time to determine whether
827     // newly released nodes can move to the readyQ.
828     SchedImpl->schedNode(SU, IsTopNode);
829 
830     updateQueues(SU, IsTopNode);
831   }
832   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
833 
834   placeDebugValues();
835 
836   LLVM_DEBUG({
837     dbgs() << "*** Final schedule for "
838            << printMBBReference(*begin()->getParent()) << " ***\n";
839     dumpSchedule();
840     dbgs() << '\n';
841   });
842 }
843 
844 /// Apply each ScheduleDAGMutation step in order.
845 void ScheduleDAGMI::postprocessDAG() {
846   for (auto &m : Mutations)
847     m->apply(this);
848 }
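
// Sketch of the interface postprocessDAG() drives (the mutation name is
// hypothetical; the clustering mutations later in this file follow this shape):
//
//   struct MyDAGMutation : ScheduleDAGMutation {
//     void apply(ScheduleDAGInstrs *DAG) override {
//       // Inspect DAG->SUnits and add edges via DAG->addEdge(...) before
//       // scheduling begins.
//     }
//   };
//
// A mutation is attached with
//   ScheduleDAGMI::addMutation(std::make_unique<MyDAGMutation>()).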
849 
850 void ScheduleDAGMI::
851 findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
852                       SmallVectorImpl<SUnit*> &BotRoots) {
853   for (SUnit &SU : SUnits) {
854     assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits");
855 
856     // Order predecessors so DFSResult follows the critical path.
857     SU.biasCriticalPath();
858 
859     // A SUnit is ready to top schedule if it has no predecessors.
860     if (!SU.NumPredsLeft)
861       TopRoots.push_back(&SU);
862     // A SUnit is ready to bottom schedule if it has no successors.
863     if (!SU.NumSuccsLeft)
864       BotRoots.push_back(&SU);
865   }
866   ExitSU.biasCriticalPath();
867 }
868 
869 /// Identify DAG roots and setup scheduler queues.
870 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
871                                ArrayRef<SUnit*> BotRoots) {
872   NextClusterSucc = nullptr;
873   NextClusterPred = nullptr;
874 
875   // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
876   //
877   // Nodes with unreleased weak edges can still be roots.
878   // Release top roots in forward order.
879   for (SUnit *SU : TopRoots)
880     SchedImpl->releaseTopNode(SU);
881 
882   // Release bottom roots in reverse order so the higher priority nodes appear
883   // first. This is more natural and slightly more efficient.
884   for (SmallVectorImpl<SUnit*>::const_reverse_iterator
885          I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
886     SchedImpl->releaseBottomNode(*I);
887   }
888 
889   releaseSuccessors(&EntrySU);
890   releasePredecessors(&ExitSU);
891 
892   SchedImpl->registerRoots();
893 
894   // Advance past initial DebugValues.
895   CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
896   CurrentBottom = RegionEnd;
897 }
898 
899 /// Update scheduler queues after scheduling an instruction.
900 void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
901   // Release dependent instructions for scheduling.
902   if (IsTopNode)
903     releaseSuccessors(SU);
904   else
905     releasePredecessors(SU);
906 
907   SU->isScheduled = true;
908 }
909 
910 /// Reinsert any remaining debug_values, just like the PostRA scheduler.
911 void ScheduleDAGMI::placeDebugValues() {
912   // If first instruction was a DBG_VALUE then put it back.
913   if (FirstDbgValue) {
914     BB->splice(RegionBegin, BB, FirstDbgValue);
915     RegionBegin = FirstDbgValue;
916   }
917 
918   for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator
919          DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
920     std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
921     MachineInstr *DbgValue = P.first;
922     MachineBasicBlock::iterator OrigPrevMI = P.second;
923     if (&*RegionBegin == DbgValue)
924       ++RegionBegin;
925     BB->splice(std::next(OrigPrevMI), BB, DbgValue);
926     if (RegionEnd != BB->end() && OrigPrevMI == &*RegionEnd)
927       RegionEnd = DbgValue;
928   }
929 }
930 
931 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
932 LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
933   for (MachineInstr &MI : *this) {
934     if (SUnit *SU = getSUnit(&MI))
935       dumpNode(*SU);
936     else
937       dbgs() << "Missing SUnit\n";
938   }
939 }
940 #endif
941 
942 //===----------------------------------------------------------------------===//
943 // ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals
944 // preservation.
945 //===----------------------------------------------------------------------===//
946 
947 ScheduleDAGMILive::~ScheduleDAGMILive() {
948   delete DFSResult;
949 }
950 
951 void ScheduleDAGMILive::collectVRegUses(SUnit &SU) {
952   const MachineInstr &MI = *SU.getInstr();
953   for (const MachineOperand &MO : MI.operands()) {
954     if (!MO.isReg())
955       continue;
956     if (!MO.readsReg())
957       continue;
958     if (TrackLaneMasks && !MO.isUse())
959       continue;
960 
961     Register Reg = MO.getReg();
962     if (!Register::isVirtualRegister(Reg))
963       continue;
964 
965     // Ignore re-defs.
966     if (TrackLaneMasks) {
967       bool FoundDef = false;
968       for (const MachineOperand &MO2 : MI.operands()) {
969         if (MO2.isReg() && MO2.isDef() && MO2.getReg() == Reg && !MO2.isDead()) {
970           FoundDef = true;
971           break;
972         }
973       }
974       if (FoundDef)
975         continue;
976     }
977 
978     // Record this local VReg use.
979     VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
980     for (; UI != VRegUses.end(); ++UI) {
981       if (UI->SU == &SU)
982         break;
983     }
984     if (UI == VRegUses.end())
985       VRegUses.insert(VReg2SUnit(Reg, LaneBitmask::getNone(), &SU));
986   }
987 }
988 
989 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
990 /// crossing a scheduling boundary. [begin, end) includes all instructions in
991 /// the region, including the boundary itself and single-instruction regions
992 /// that don't get scheduled.
993 void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
994                                 MachineBasicBlock::iterator begin,
995                                 MachineBasicBlock::iterator end,
996                                 unsigned regioninstrs)
997 {
998   // ScheduleDAGMI initializes SchedImpl's per-region policy.
999   ScheduleDAGMI::enterRegion(bb, begin, end, regioninstrs);
1000 
1001   // For convenience remember the end of the liveness region.
1002   LiveRegionEnd = (RegionEnd == bb->end()) ? RegionEnd : std::next(RegionEnd);
1003 
1004   SUPressureDiffs.clear();
1005 
1006   ShouldTrackPressure = SchedImpl->shouldTrackPressure();
1007   ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks();
1008 
1009   assert((!ShouldTrackLaneMasks || ShouldTrackPressure) &&
1010          "ShouldTrackLaneMasks requires ShouldTrackPressure");
1011 }
1012 
1013 // Setup the register pressure trackers for the top scheduled and bottom
1014 // scheduled regions.
1015 void ScheduleDAGMILive::initRegPressure() {
1016   VRegUses.clear();
1017   VRegUses.setUniverse(MRI.getNumVirtRegs());
1018   for (SUnit &SU : SUnits)
1019     collectVRegUses(SU);
1020 
1021   TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin,
1022                     ShouldTrackLaneMasks, false);
1023   BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
1024                     ShouldTrackLaneMasks, false);
1025 
1026   // Close the RPTracker to finalize live ins.
1027   RPTracker.closeRegion();
1028 
1029   LLVM_DEBUG(RPTracker.dump());
1030 
1031   // Initialize the live ins and live outs.
1032   TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
1033   BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
1034 
1035   // Close one end of the tracker so we can call
1036   // getMaxUpward/DownwardPressureDelta before advancing across any
1037   // instructions. This converts currently live regs into live ins/outs.
1038   TopRPTracker.closeTop();
1039   BotRPTracker.closeBottom();
1040 
1041   BotRPTracker.initLiveThru(RPTracker);
1042   if (!BotRPTracker.getLiveThru().empty()) {
1043     TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
1044     LLVM_DEBUG(dbgs() << "Live Thru: ";
1045                dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
1046   }
1047 
1048   // For each live out vreg reduce the pressure change associated with other
1049   // uses of the same vreg below the live-out reaching def.
1050   updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
1051 
1052   // Account for liveness generated by the region boundary.
1053   if (LiveRegionEnd != RegionEnd) {
1054     SmallVector<RegisterMaskPair, 8> LiveUses;
1055     BotRPTracker.recede(&LiveUses);
1056     updatePressureDiffs(LiveUses);
1057   }
1058 
1059   LLVM_DEBUG(dbgs() << "Top Pressure:\n";
1060              dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
1061              dbgs() << "Bottom Pressure:\n";
1062              dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI););
1063 
1064   assert((BotRPTracker.getPos() == RegionEnd ||
1065           (RegionEnd->isDebugInstr() &&
1066            BotRPTracker.getPos() == priorNonDebug(RegionEnd, RegionBegin))) &&
1067          "Can't find the region bottom");
1068 
1069   // Cache the list of excess pressure sets in this region. This will also track
1070   // the max pressure in the scheduled code for these sets.
1071   RegionCriticalPSets.clear();
1072   const std::vector<unsigned> &RegionPressure =
1073     RPTracker.getPressure().MaxSetPressure;
1074   for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
1075     unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
1076     if (RegionPressure[i] > Limit) {
1077       LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(i) << " Limit " << Limit
1078                         << " Actual " << RegionPressure[i] << "\n");
1079       RegionCriticalPSets.push_back(PressureChange(i));
1080     }
1081   }
1082   LLVM_DEBUG(dbgs() << "Excess PSets: ";
1083              for (const PressureChange &RCPS
1084                   : RegionCriticalPSets) dbgs()
1085              << TRI->getRegPressureSetName(RCPS.getPSet()) << " ";
1086              dbgs() << "\n");
1087 }
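
// Worked example (numbers are hypothetical): if a pressure set has a limit of
// 12 and RPTracker saw a maximum of 15 units live somewhere in the region, the
// set is recorded in RegionCriticalPSets above; updateScheduledPressure() below
// then tracks how the scheduled code's max pressure for that set evolves as
// instructions are placed.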
1088 
1089 void ScheduleDAGMILive::
1090 updateScheduledPressure(const SUnit *SU,
1091                         const std::vector<unsigned> &NewMaxPressure) {
1092   const PressureDiff &PDiff = getPressureDiff(SU);
1093   unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
1094   for (const PressureChange &PC : PDiff) {
1095     if (!PC.isValid())
1096       break;
1097     unsigned ID = PC.getPSet();
1098     while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
1099       ++CritIdx;
1100     if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
1101       if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
1102           && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max())
1103         RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
1104     }
1105     unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
1106     if (NewMaxPressure[ID] >= Limit - 2) {
1107       LLVM_DEBUG(dbgs() << "  " << TRI->getRegPressureSetName(ID) << ": "
1108                         << NewMaxPressure[ID]
1109                         << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ")
1110                         << Limit << "(+ " << BotRPTracker.getLiveThru()[ID]
1111                         << " livethru)\n");
1112     }
1113   }
1114 }
1115 
1116 /// Update the PressureDiff array for liveness after scheduling this
1117 /// instruction.
1118 void ScheduleDAGMILive::updatePressureDiffs(
1119     ArrayRef<RegisterMaskPair> LiveUses) {
1120   for (const RegisterMaskPair &P : LiveUses) {
1121     Register Reg = P.RegUnit;
1122     /// FIXME: Currently assuming single-use physregs.
1123     if (!Register::isVirtualRegister(Reg))
1124       continue;
1125 
1126     if (ShouldTrackLaneMasks) {
1127       // If the register has just become live then other uses won't change
1128       // this fact anymore => decrement pressure.
1129       // If the register has just become dead then other uses make it come
1130       // back to life => increment pressure.
1131       bool Decrement = P.LaneMask.any();
1132 
1133       for (const VReg2SUnit &V2SU
1134            : make_range(VRegUses.find(Reg), VRegUses.end())) {
1135         SUnit &SU = *V2SU.SU;
1136         if (SU.isScheduled || &SU == &ExitSU)
1137           continue;
1138 
1139         PressureDiff &PDiff = getPressureDiff(&SU);
1140         PDiff.addPressureChange(Reg, Decrement, &MRI);
1141         LLVM_DEBUG(dbgs() << "  UpdateRegP: SU(" << SU.NodeNum << ") "
1142                           << printReg(Reg, TRI) << ':'
1143                           << PrintLaneMask(P.LaneMask) << ' ' << *SU.getInstr();
1144                    dbgs() << "              to "; PDiff.dump(*TRI););
1145       }
1146     } else {
1147       assert(P.LaneMask.any());
1148       LLVM_DEBUG(dbgs() << "  LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n");
1149       // This may be called before CurrentBottom has been initialized. However,
1150       // BotRPTracker must have a valid position. We want the value live into the
1151       // instruction or live out of the block, so ask for the previous
1152       // instruction's live-out.
1153       const LiveInterval &LI = LIS->getInterval(Reg);
1154       VNInfo *VNI;
1155       MachineBasicBlock::const_iterator I =
1156         nextIfDebug(BotRPTracker.getPos(), BB->end());
1157       if (I == BB->end())
1158         VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
1159       else {
1160         LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I));
1161         VNI = LRQ.valueIn();
1162       }
1163       // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
1164       assert(VNI && "No live value at use.");
1165       for (const VReg2SUnit &V2SU
1166            : make_range(VRegUses.find(Reg), VRegUses.end())) {
1167         SUnit *SU = V2SU.SU;
1168         // If this use comes before the reaching def, it cannot be a last use,
1169         // so decrease its pressure change.
1170         if (!SU->isScheduled && SU != &ExitSU) {
1171           LiveQueryResult LRQ =
1172               LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
1173           if (LRQ.valueIn() == VNI) {
1174             PressureDiff &PDiff = getPressureDiff(SU);
1175             PDiff.addPressureChange(Reg, true, &MRI);
1176             LLVM_DEBUG(dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") "
1177                               << *SU->getInstr();
1178                        dbgs() << "              to "; PDiff.dump(*TRI););
1179           }
1180         }
1181       }
1182     }
1183   }
1184 }
1185 
1186 void ScheduleDAGMILive::dump() const {
1187 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1188   if (EntrySU.getInstr() != nullptr)
1189     dumpNodeAll(EntrySU);
1190   for (const SUnit &SU : SUnits) {
1191     dumpNodeAll(SU);
1192     if (ShouldTrackPressure) {
1193       dbgs() << "  Pressure Diff      : ";
1194       getPressureDiff(&SU).dump(*TRI);
1195     }
1196     dbgs() << "  Single Issue       : ";
1197     if (SchedModel.mustBeginGroup(SU.getInstr()) &&
1198         SchedModel.mustEndGroup(SU.getInstr()))
1199       dbgs() << "true;";
1200     else
1201       dbgs() << "false;";
1202     dbgs() << '\n';
1203   }
1204   if (ExitSU.getInstr() != nullptr)
1205     dumpNodeAll(ExitSU);
1206 #endif
1207 }
1208 
1209 /// schedule - Called back from MachineScheduler::runOnMachineFunction
1210 /// after setting up the current scheduling region. [RegionBegin, RegionEnd)
1211 /// only includes instructions that have DAG nodes, not scheduling boundaries.
1212 ///
1213 /// This is a skeletal driver, with all the functionality pushed into helpers,
1214 /// so that it can be easily extended by experimental schedulers. Generally,
1215 /// implementing MachineSchedStrategy should be sufficient to implement a new
1216 /// scheduling algorithm. However, if a scheduler further subclasses
1217 /// ScheduleDAGMILive then it will want to override this virtual method in order
1218 /// to update any specialized state.
1219 void ScheduleDAGMILive::schedule() {
1220   LLVM_DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n");
1221   LLVM_DEBUG(SchedImpl->dumpPolicy());
1222   buildDAGWithRegPressure();
1223 
1224   postprocessDAG();
1225 
1226   SmallVector<SUnit*, 8> TopRoots, BotRoots;
1227   findRootsAndBiasEdges(TopRoots, BotRoots);
1228 
1229   // Initialize the strategy before modifying the DAG.
1230   // This may initialize a DFSResult to be used for queue priority.
1231   SchedImpl->initialize(this);
1232 
1233   LLVM_DEBUG(dump());
1234   if (PrintDAGs) dump();
1235   if (ViewMISchedDAGs) viewGraph();
1236 
1237   // Initialize ready queues now that the DAG and priority data are finalized.
1238   initQueues(TopRoots, BotRoots);
1239 
1240   bool IsTopNode = false;
1241   while (true) {
1242     LLVM_DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
1243     SUnit *SU = SchedImpl->pickNode(IsTopNode);
1244     if (!SU) break;
1245 
1246     assert(!SU->isScheduled && "Node already scheduled");
1247     if (!checkSchedLimit())
1248       break;
1249 
1250     scheduleMI(SU, IsTopNode);
1251 
1252     if (DFSResult) {
1253       unsigned SubtreeID = DFSResult->getSubtreeID(SU);
1254       if (!ScheduledTrees.test(SubtreeID)) {
1255         ScheduledTrees.set(SubtreeID);
1256         DFSResult->scheduleTree(SubtreeID);
1257         SchedImpl->scheduleTree(SubtreeID);
1258       }
1259     }
1260 
1261     // Notify the scheduling strategy after updating the DAG.
1262     SchedImpl->schedNode(SU, IsTopNode);
1263 
1264     updateQueues(SU, IsTopNode);
1265   }
1266   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
1267 
1268   placeDebugValues();
1269 
1270   LLVM_DEBUG({
1271     dbgs() << "*** Final schedule for "
1272            << printMBBReference(*begin()->getParent()) << " ***\n";
1273     dumpSchedule();
1274     dbgs() << '\n';
1275   });
1276 }
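
// A skeletal strategy, as suggested in the comment above (illustrative and
// hypothetical; a real strategy would also track cycles and heuristics):
//
//   class MyStrategy : public MachineSchedStrategy {
//     std::vector<SUnit *> Available;
//   public:
//     void initialize(ScheduleDAGMI *DAG) override { Available.clear(); }
//     void releaseTopNode(SUnit *SU) override { Available.push_back(SU); }
//     void releaseBottomNode(SUnit *SU) override {}
//     SUnit *pickNode(bool &IsTopNode) override {
//       IsTopNode = true;
//       if (Available.empty())
//         return nullptr;
//       SUnit *SU = Available.back();
//       Available.pop_back();
//       return SU;
//     }
//     void schedNode(SUnit *SU, bool IsTopNode) override {}
//   };
//
// Such a strategy would be plugged in with
//   new ScheduleDAGMILive(C, std::make_unique<MyStrategy>());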
1277 
1278 /// Build the DAG and setup three register pressure trackers.
1279 void ScheduleDAGMILive::buildDAGWithRegPressure() {
1280   if (!ShouldTrackPressure) {
1281     RPTracker.reset();
1282     RegionCriticalPSets.clear();
1283     buildSchedGraph(AA);
1284     return;
1285   }
1286 
1287   // Initialize the register pressure tracker used by buildSchedGraph.
1288   RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
1289                  ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true);
1290 
1291   // Account for liveness generated by the region boundary.
1292   if (LiveRegionEnd != RegionEnd)
1293     RPTracker.recede();
1294 
1295   // Build the DAG, and compute current register pressure.
1296   buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks);
1297 
1298   // Initialize top/bottom trackers after computing region pressure.
1299   initRegPressure();
1300 }
1301 
1302 void ScheduleDAGMILive::computeDFSResult() {
1303   if (!DFSResult)
1304     DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
1305   DFSResult->clear();
1306   ScheduledTrees.clear();
1307   DFSResult->resize(SUnits.size());
1308   DFSResult->compute(SUnits);
1309   ScheduledTrees.resize(DFSResult->getNumSubtrees());
1310 }
1311 
1312 /// Compute the max cyclic critical path through the DAG. The scheduling DAG
1313 /// only provides the critical path for single block loops. To handle loops that
1314 /// span blocks, we could use the vreg path latencies provided by
1315 /// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
1316 /// available for use in the scheduler.
1317 ///
1318 /// The cyclic path estimation identifies a def-use pair that crosses the back
1319 /// edge and considers the depth and height of the nodes. For example, consider
1320 /// the following instruction sequence where each instruction has unit latency
1321 /// and defines an eponymous virtual register:
1322 ///
1323 /// a->b(a,c)->c(b)->d(c)->exit
1324 ///
1325 /// The cyclic critical path is two cycles: b->c->b
1326 /// The acyclic critical path is four cycles: a->b->c->d->exit
1327 /// LiveOutHeight = height(c) = len(c->d->exit) = 2
1328 /// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
1329 /// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
1330 /// LiveInDepth = depth(b) = len(a->b) = 1
1331 ///
1332 /// LiveOutDepth - LiveInDepth = 3 - 1 = 2
1333 /// LiveInHeight - LiveOutHeight = 4 - 2 = 2
1334 /// CyclicCriticalPath = min(2, 2) = 2
1335 ///
1336 /// This could be relevant to PostRA scheduling, but is currently implemented
1337 /// assuming LiveIntervals.
1338 unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
1339   // This only applies to single-block loops.
1340   if (!BB->isSuccessor(BB))
1341     return 0;
1342 
1343   unsigned MaxCyclicLatency = 0;
1344   // Visit each live out vreg def to find def/use pairs that cross iterations.
1345   for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
1346     Register Reg = P.RegUnit;
1347     if (!Register::isVirtualRegister(Reg))
1348       continue;
1349     const LiveInterval &LI = LIS->getInterval(Reg);
1350     const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
1351     if (!DefVNI)
1352       continue;
1353 
1354     MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
1355     const SUnit *DefSU = getSUnit(DefMI);
1356     if (!DefSU)
1357       continue;
1358 
1359     unsigned LiveOutHeight = DefSU->getHeight();
1360     unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
1361     // Visit all local users of the vreg def.
1362     for (const VReg2SUnit &V2SU
1363          : make_range(VRegUses.find(Reg), VRegUses.end())) {
1364       SUnit *SU = V2SU.SU;
1365       if (SU == &ExitSU)
1366         continue;
1367 
1368       // Only consider uses of the phi.
1369       LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
1370       if (!LRQ.valueIn()->isPHIDef())
1371         continue;
1372 
1373       // Assume that a path spanning two iterations is a cycle, which could
1374       // overestimate in strange cases. This allows cyclic latency to be
1375       // estimated as the minimum slack of the vreg's depth or height.
1376       unsigned CyclicLatency = 0;
1377       if (LiveOutDepth > SU->getDepth())
1378         CyclicLatency = LiveOutDepth - SU->getDepth();
1379 
1380       unsigned LiveInHeight = SU->getHeight() + DefSU->Latency;
1381       if (LiveInHeight > LiveOutHeight) {
1382         if (LiveInHeight - LiveOutHeight < CyclicLatency)
1383           CyclicLatency = LiveInHeight - LiveOutHeight;
1384       } else
1385         CyclicLatency = 0;
1386 
1387       LLVM_DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
1388                         << SU->NodeNum << ") = " << CyclicLatency << "c\n");
1389       if (CyclicLatency > MaxCyclicLatency)
1390         MaxCyclicLatency = CyclicLatency;
1391     }
1392   }
1393   LLVM_DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
1394   return MaxCyclicLatency;
1395 }
1396 
1397 /// Release ExitSU predecessors and setup scheduler queues. Re-position
1398 /// the Top RP tracker in case the region beginning has changed.
1399 void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots,
1400                                    ArrayRef<SUnit*> BotRoots) {
1401   ScheduleDAGMI::initQueues(TopRoots, BotRoots);
1402   if (ShouldTrackPressure) {
1403     assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
1404     TopRPTracker.setPos(CurrentTop);
1405   }
1406 }
1407 
1408 /// Move an instruction and update register pressure.
1409 void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
1410   // Move the instruction to its new location in the instruction stream.
1411   MachineInstr *MI = SU->getInstr();
1412 
1413   if (IsTopNode) {
1414     assert(SU->isTopReady() && "node still has unscheduled dependencies");
1415     if (&*CurrentTop == MI)
1416       CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
1417     else {
1418       moveInstruction(MI, CurrentTop);
1419       TopRPTracker.setPos(MI);
1420     }
1421 
1422     if (ShouldTrackPressure) {
1423       // Update top scheduled pressure.
1424       RegisterOperands RegOpers;
1425       RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
1426       if (ShouldTrackLaneMasks) {
1427         // Adjust liveness and add missing dead+read-undef flags.
1428         SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
1429         RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
1430       } else {
1431         // Adjust for missing dead-def flags.
1432         RegOpers.detectDeadDefs(*MI, *LIS);
1433       }
1434 
1435       TopRPTracker.advance(RegOpers);
1436       assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
1437       LLVM_DEBUG(dbgs() << "Top Pressure:\n"; dumpRegSetPressure(
1438                      TopRPTracker.getRegSetPressureAtPos(), TRI););
1439 
1440       updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
1441     }
1442   } else {
1443     assert(SU->isBottomReady() && "node still has unscheduled dependencies");
1444     MachineBasicBlock::iterator priorII =
1445       priorNonDebug(CurrentBottom, CurrentTop);
1446     if (&*priorII == MI)
1447       CurrentBottom = priorII;
1448     else {
1449       if (&*CurrentTop == MI) {
1450         CurrentTop = nextIfDebug(++CurrentTop, priorII);
1451         TopRPTracker.setPos(CurrentTop);
1452       }
1453       moveInstruction(MI, CurrentBottom);
1454       CurrentBottom = MI;
1455       BotRPTracker.setPos(CurrentBottom);
1456     }
1457     if (ShouldTrackPressure) {
1458       RegisterOperands RegOpers;
1459       RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
1460       if (ShouldTrackLaneMasks) {
1461         // Adjust liveness and add missing dead+read-undef flags.
1462         SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
1463         RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
1464       } else {
1465         // Adjust for missing dead-def flags.
1466         RegOpers.detectDeadDefs(*MI, *LIS);
1467       }
1468 
1469       if (BotRPTracker.getPos() != CurrentBottom)
1470         BotRPTracker.recedeSkipDebugValues();
1471       SmallVector<RegisterMaskPair, 8> LiveUses;
1472       BotRPTracker.recede(RegOpers, &LiveUses);
1473       assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
1474       LLVM_DEBUG(dbgs() << "Bottom Pressure:\n"; dumpRegSetPressure(
1475                      BotRPTracker.getRegSetPressureAtPos(), TRI););
1476 
1477       updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
1478       updatePressureDiffs(LiveUses);
1479     }
1480   }
1481 }
1482 
1483 //===----------------------------------------------------------------------===//
1484 // BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
1485 //===----------------------------------------------------------------------===//
1486 
1487 namespace {
1488 
1489 /// Post-process the DAG to create cluster edges between neighboring
1490 /// loads or between neighboring stores.
1491 class BaseMemOpClusterMutation : public ScheduleDAGMutation {
1492   struct MemOpInfo {
1493     SUnit *SU;
1494     SmallVector<const MachineOperand *, 4> BaseOps;
1495     int64_t Offset;
1496     unsigned Width;
1497 
1498     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
1499               int64_t Offset, unsigned Width)
1500         : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
1501           Width(Width) {}
1502 
1503     static bool Compare(const MachineOperand *const &A,
1504                         const MachineOperand *const &B) {
1505       if (A->getType() != B->getType())
1506         return A->getType() < B->getType();
1507       if (A->isReg())
1508         return A->getReg() < B->getReg();
1509       if (A->isFI()) {
1510         const MachineFunction &MF = *A->getParent()->getParent()->getParent();
1511         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
1512         bool StackGrowsDown = TFI.getStackGrowthDirection() ==
1513                               TargetFrameLowering::StackGrowsDown;
1514         return StackGrowsDown ? A->getIndex() > B->getIndex()
1515                               : A->getIndex() < B->getIndex();
1516       }
1517 
1518       llvm_unreachable("MemOpClusterMutation only supports register or frame "
1519                        "index bases.");
1520     }
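    // For illustration only: with register bases, Compare orders records by
    // register number, so after sorting (see operator< below) mem ops that
    // share a base register become adjacent, which lets the clustering walk
    // find candidate pairs with a simple forward scan.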
1521 
1522     bool operator<(const MemOpInfo &RHS) const {
1523       // FIXME: Don't compare everything twice. Maybe use C++20 three way
1524       // comparison instead when it's available.
1525       if (std::lexicographical_compare(BaseOps.begin(), BaseOps.end(),
1526                                        RHS.BaseOps.begin(), RHS.BaseOps.end(),
1527                                        Compare))
1528         return true;
1529       if (std::lexicographical_compare(RHS.BaseOps.begin(), RHS.BaseOps.end(),
1530                                        BaseOps.begin(), BaseOps.end(), Compare))
1531         return false;
1532       if (Offset != RHS.Offset)
1533         return Offset < RHS.Offset;
1534       return SU->NodeNum < RHS.SU->NodeNum;
1535     }
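    // A possible C++20 alternative (sketch only; CompareThreeWay would be a
    // hypothetical three-way variant of Compare above) that avoids comparing
    // everything twice:
    //   std::strong_ordering operator<=>(const MemOpInfo &RHS) const {
    //     if (auto C = std::lexicographical_compare_three_way(
    //             BaseOps.begin(), BaseOps.end(), RHS.BaseOps.begin(),
    //             RHS.BaseOps.end(), CompareThreeWay); C != 0)
    //       return C;
    //     if (auto C = Offset <=> RHS.Offset; C != 0)
    //       return C;
    //     return SU->NodeNum <=> RHS.SU->NodeNum;
    //   }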
1536   };
1537 
1538   const TargetInstrInfo *TII;
1539   const TargetRegisterInfo *TRI;
1540   bool IsLoad;
1541 
1542 public:
1543   BaseMemOpClusterMutation(const TargetInstrInfo *tii,
1544                            const TargetRegisterInfo *tri, bool IsLoad)
1545       : TII(tii), TRI(tri), IsLoad(IsLoad) {}
1546 
1547   void apply(ScheduleDAGInstrs *DAGInstrs) override;
1548 
1549 protected:
1550   void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster,
1551                                 ScheduleDAGInstrs *DAG);
1552   void collectMemOpRecords(std::vector<SUnit> &SUnits,
1553                            SmallVectorImpl<MemOpInfo> &MemOpRecords);
1554   bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
1555                    DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups);
1556 };
1557 
1558 class StoreClusterMutation : public BaseMemOpClusterMutation {
1559 public:
1560   StoreClusterMutation(const TargetInstrInfo *tii,
1561                        const TargetRegisterInfo *tri)
1562       : BaseMemOpClusterMutation(tii, tri, false) {}
1563 };
1564 
1565 class LoadClusterMutation : public BaseMemOpClusterMutation {
1566 public:
1567   LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
1568       : BaseMemOpClusterMutation(tii, tri, true) {}
1569 };
1570 
1571 } // end anonymous namespace
1572 
1573 namespace llvm {
1574 
1575 std::unique_ptr<ScheduleDAGMutation>
1576 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
1577                              const TargetRegisterInfo *TRI) {
1578   return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
1579                             : nullptr;
1580 }
1581 
1582 std::unique_ptr<ScheduleDAGMutation>
1583 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
1584                               const TargetRegisterInfo *TRI) {
1585   return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
1586                             : nullptr;
1587 }
1588 
1589 } // end namespace llvm
1590 
1591 // Sort all the loads/stores first; then, for each load/store, check the
1592 // following loads/stores one by one until reaching the first non-dependent
1593 // one, and call the target hook to see whether they can be clustered.
1594 // If FastCluster is enabled, we assume that all the loads/stores have been
1595 // preprocessed and have no dependencies on each other.
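// For illustration only: given sorted records with the same base register and
// (Offset, Width) pairs (0, 4), (4, 4), (8, 4), the walk below may first
// cluster the first two records, then use the second record as MemOpa and
// extend the cluster with the third, growing ClusterLength to 3 and
// CurrentClusterBytes to 12, provided the target's shouldClusterMemOps hook
// agrees at each step.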
1596 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
1597     ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
1598     ScheduleDAGInstrs *DAG) {
1599   // Keep track of the current cluster length and bytes for each SUnit.
1600   DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
1601 
1602   // At this point, the `MemOpRecords` array must hold at least two mem ops.
1603   // Try to cluster the mem ops collected within it.
1604   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
1605     // The decision to cluster mem ops is based on target-dependent logic.
1606     auto MemOpa = MemOpRecords[Idx];
1607 
1608     // Look for the next load/store to cluster with.
1609     unsigned NextIdx = Idx + 1;
1610     for (; NextIdx < End; ++NextIdx)
1611       // Skip if MemOpb has already been clustered or has a dependency on
1612       // MemOpa.
1613       if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
1614           (FastCluster ||
1615            (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
1616             !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
1617         break;
1618     if (NextIdx == End)
1619       continue;
1620 
1621     auto MemOpb = MemOpRecords[NextIdx];
1622     unsigned ClusterLength = 2;
1623     unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
1624     if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
1625       ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
1626       CurrentClusterBytes =
1627           SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
1628     }
1629 
1630     if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
1631                                   CurrentClusterBytes))
1632       continue;
1633 
1634     SUnit *SUa = MemOpa.SU;
1635     SUnit *SUb = MemOpb.SU;
1636     if (SUa->NodeNum > SUb->NodeNum)
1637       std::swap(SUa, SUb);
1638 
1639     // FIXME: Is this check really required?
1640     if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
1641       continue;
1642 
1643     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
1644                       << SUb->NodeNum << ")\n");
1645     ++NumClustered;
1646 
1647     if (IsLoad) {
1648       // Copy successor edges from SUa to SUb. Interleaving computation
1649       // dependent on SUa can prevent load combining due to register reuse.
1650       // Predecessor edges do not need to be copied from SUb to SUa since
1651       // nearby loads should have effectively the same inputs.
1652       for (const SDep &Succ : SUa->Succs) {
1653         if (Succ.getSUnit() == SUb)
1654           continue;
1655         LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
1656                           << ")\n");
1657         DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
1658       }
1659     } else {
1660       // Copy predecessor edges from SUb to SUa so that the SUnits SUb
1661       // depends on cannot be scheduled in between SUb and SUa. Successor
1662       // edges do not need to be copied from SUa to SUb since nothing will
1663       // depend on stores.
1664       // Note that we don't need to worry about memory dependencies: we
1665       // won't try to cluster two ops if they have any memory dependency.
1666       for (const SDep &Pred : SUb->Preds) {
1667         if (Pred.getSUnit() == SUa)
1668           continue;
1669         LLVM_DEBUG(dbgs() << "  Copy Pred SU(" << Pred.getSUnit()->NodeNum
1670                           << ")\n");
1671         DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
1672       }
1673     }
1674 
1675     SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
1676                                              CurrentClusterBytes};
1677 
1678     LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength
1679                       << ", Curr cluster bytes: " << CurrentClusterBytes
1680                       << "\n");
1681   }
1682 }
1683 
1684 void BaseMemOpClusterMutation::collectMemOpRecords(
1685     std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
1686   for (auto &SU : SUnits) {
1687     if ((IsLoad && !SU.getInstr()->mayLoad()) ||
1688         (!IsLoad && !SU.getInstr()->mayStore()))
1689       continue;
1690 
1691     const MachineInstr &MI = *SU.getInstr();
1692     SmallVector<const MachineOperand *, 4> BaseOps;
1693     int64_t Offset;
1694     bool OffsetIsScalable;
1695     unsigned Width;
1696     if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
1697                                            OffsetIsScalable, Width, TRI)) {
1698       MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
1699 
1700       LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
1701                         << Offset << ", OffsetIsScalable: " << OffsetIsScalable
1702                         << ", Width: " << Width << "\n");
1703     }
1704 #ifndef NDEBUG
1705     for (const auto *Op : BaseOps)
1706       assert(Op);
1707 #endif
1708   }
1709 }
1710 
1711 bool BaseMemOpClusterMutation::groupMemOps(
1712     ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
1713     DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
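  // Heuristic note: when the number of mem ops times the DAG size is large,
  // fall back to FastCluster mode, which skips the pairwise IsReachable
  // queries in clusterNeighboringMemOps and relies on the grouping below
  // instead.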
1714   bool FastCluster =
1715       ForceFastCluster ||
1716       MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
1717 
1718   for (const auto &MemOp : MemOps) {
1719     unsigned ChainPredID = DAG->SUnits.size();
1720     if (FastCluster) {
1721       for (const SDep &Pred : MemOp.SU->Preds) {
1722         // We only want to cluster mem ops that have the same ctrl (non-data)
1723         // pred, so that they have no ctrl dependency on each other. But for
1724         // store instrs, we can still cluster them if the pred is a load instr.
1725         if ((Pred.isCtrl() &&
1726              (IsLoad ||
1727               (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
1728             !Pred.isArtificial()) {
1729           ChainPredID = Pred.getSUnit()->NodeNum;
1730           break;
1731         }
1732       }
1733     } else
1734       ChainPredID = 0;
1735 
1736     Groups[ChainPredID].push_back(MemOp);
1737   }
1738   return FastCluster;
1739 }
1740 
1741 /// Callback from DAG postProcessing to create cluster edges for loads/stores.
1742 void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
1743   // Collect all the clusterable loads/stores
1744   SmallVector<MemOpInfo, 32> MemOpRecords;
1745   collectMemOpRecords(DAG->SUnits, MemOpRecords);
1746 
1747   if (MemOpRecords.size() < 2)
1748     return;
1749 
1750   // If the DAG is too complex, use a heuristic to put loads/stores without
1751   // dependencies into the same group, to avoid a compile-time blow-up.
1752   // Note that some fusion pairs could be lost with this.
1753   DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
1754   bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
1755 
1756   for (auto &Group : Groups) {
1757     // Sort the loads/stores so that we can stop clustering as early as
1758     // possible.
1759     llvm::sort(Group.second);
1760 
1761     // Try to cluster all the neighboring loads/stores.
1762     clusterNeighboringMemOps(Group.second, FastCluster, DAG);
1763   }
1764 }
1765 
1766 //===----------------------------------------------------------------------===//
1767 // CopyConstrain - DAG post-processing to encourage copy elimination.
1768 //===----------------------------------------------------------------------===//
1769 
1770 namespace {
1771 
1772 /// Post-process the DAG to create weak edges from all uses of a copy to
1773 /// the one use that defines the copy's source vreg, most likely an induction
1774 /// variable increment.
1775 class CopyConstrain : public ScheduleDAGMutation {
1776   // Transient state.
1777   SlotIndex RegionBeginIdx;
1778 
1779   // RegionEndIdx is the slot index of the last non-debug instruction in the
1780   // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
1781   SlotIndex RegionEndIdx;
1782 
1783 public:
1784   CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
1785 
1786   void apply(ScheduleDAGInstrs *DAGInstrs) override;
1787 
1788 protected:
1789   void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
1790 };
1791 
1792 } // end anonymous namespace
1793 
1794 namespace llvm {
1795 
1796 std::unique_ptr<ScheduleDAGMutation>
1797 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
1798                                const TargetRegisterInfo *TRI) {
1799   return std::make_unique<CopyConstrain>(TII, TRI);
1800 }
1801 
1802 } // end namespace llvm
1803 
1804 /// constrainLocalCopy handles two possibilities:
1805 /// 1) Local src:
1806 /// I0:     = dst
1807 /// I1: src = ...
1808 /// I2:     = dst
1809 /// I3: dst = src (copy)
1810 /// (create pred->succ edges I0->I1, I2->I1)
1811 ///
1812 /// 2) Local copy:
1813 /// I0: dst = src (copy)
1814 /// I1:     = dst
1815 /// I2: src = ...
1816 /// I3:     = dst
1817 /// (create pred->succ edges I1->I2, I3->I2)
1818 ///
1819 /// Although the MachineScheduler is currently constrained to single blocks,
1820 /// this algorithm should handle extended blocks. An EBB is a set of
1821 /// contiguously numbered blocks such that the previous block in the EBB is
1822 /// always the single predecessor.
1823 void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
1824   LiveIntervals *LIS = DAG->getLIS();
1825   MachineInstr *Copy = CopySU->getInstr();
1826 
1827   // Check for pure vreg copies.
1828   const MachineOperand &SrcOp = Copy->getOperand(1);
1829   Register SrcReg = SrcOp.getReg();
1830   if (!Register::isVirtualRegister(SrcReg) || !SrcOp.readsReg())
1831     return;
1832 
1833   const MachineOperand &DstOp = Copy->getOperand(0);
1834   Register DstReg = DstOp.getReg();
1835   if (!Register::isVirtualRegister(DstReg) || DstOp.isDead())
1836     return;
1837 
1838   // Check if either the dest or source is local. If it's live across a back
1839   // edge, it's not local. Note that if both vregs are live across the back
1840   // edge, we cannot successfully constrain the copy without cyclic scheduling.
1841   // If both the copy's source and dest are local live intervals, then we
1842   // should treat the dest as the global for the purpose of adding
1843   // constraints. This adds edges from source's other uses to the copy.
1844   unsigned LocalReg = SrcReg;
1845   unsigned GlobalReg = DstReg;
1846   LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
1847   if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
1848     LocalReg = DstReg;
1849     GlobalReg = SrcReg;
1850     LocalLI = &LIS->getInterval(LocalReg);
1851     if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
1852       return;
1853   }
1854   LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
1855 
1856   // Find the global segment after the start of the local LI.
1857   LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
1858   // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
1859   // local live range. We could create edges from other global uses to the local
1860   // start, but the coalescer should have already eliminated these cases, so
1861   // don't bother dealing with it.
1862   if (GlobalSegment == GlobalLI->end())
1863     return;
1864 
1865   // If GlobalSegment is killed at the LocalLI->start, the call to find()
1866   // returned the next global segment. But if GlobalSegment overlaps with
1867   // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
1868   // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
1869   if (GlobalSegment->contains(LocalLI->beginIndex()))
1870     ++GlobalSegment;
1871 
1872   if (GlobalSegment == GlobalLI->end())
1873     return;
1874 
1875   // Check if GlobalLI contains a hole in the vicinity of LocalLI.
1876   if (GlobalSegment != GlobalLI->begin()) {
1877     // Two address defs have no hole.
1878     if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->end,
1879                                GlobalSegment->start)) {
1880       return;
1881     }
1882     // If the prior global segment may be defined by the same two-address
1883     // instruction that also defines LocalLI, then we can't make a hole here.
1884     if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->start,
1885                                LocalLI->beginIndex())) {
1886       return;
1887     }
1888     // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
1889     // it would be a disconnected component in the live range.
1890     assert(std::prev(GlobalSegment)->start < LocalLI->beginIndex() &&
1891            "Disconnected LRG within the scheduling region.");
1892   }
1893   MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
1894   if (!GlobalDef)
1895     return;
1896 
1897   SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
1898   if (!GlobalSU)
1899     return;
1900 
1901   // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
1902   // constraining the uses of the last local def to precede GlobalDef.
1903   SmallVector<SUnit*,8> LocalUses;
1904   const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
1905   MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
1906   SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
1907   for (const SDep &Succ : LastLocalSU->Succs) {
1908     if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg)
1909       continue;
1910     if (Succ.getSUnit() == GlobalSU)
1911       continue;
1912     if (!DAG->canAddEdge(GlobalSU, Succ.getSUnit()))
1913       return;
1914     LocalUses.push_back(Succ.getSUnit());
1915   }
1916   // Open the top of the GlobalLI hole by constraining any earlier global uses
1917   // to precede the start of LocalLI.
1918   SmallVector<SUnit*,8> GlobalUses;
1919   MachineInstr *FirstLocalDef =
1920     LIS->getInstructionFromIndex(LocalLI->beginIndex());
1921   SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
1922   for (const SDep &Pred : GlobalSU->Preds) {
1923     if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg)
1924       continue;
1925     if (Pred.getSUnit() == FirstLocalSU)
1926       continue;
1927     if (!DAG->canAddEdge(FirstLocalSU, Pred.getSUnit()))
1928       return;
1929     GlobalUses.push_back(Pred.getSUnit());
1930   }
1931   LLVM_DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
1932   // Add the weak edges.
1933   for (SUnit *LU : LocalUses) {
1934     LLVM_DEBUG(dbgs() << "  Local use SU(" << LU->NodeNum << ") -> SU("
1935                       << GlobalSU->NodeNum << ")\n");
1936     DAG->addEdge(GlobalSU, SDep(LU, SDep::Weak));
1937   }
1938   for (SUnit *GU : GlobalUses) {
1939     LLVM_DEBUG(dbgs() << "  Global use SU(" << GU->NodeNum << ") -> SU("
1940                       << FirstLocalSU->NodeNum << ")\n");
1941     DAG->addEdge(FirstLocalSU, SDep(GU, SDep::Weak));
1942   }
1943 }
1944 
1945 /// Callback from DAG postProcessing to create weak edges to encourage
1946 /// copy elimination.
1947 void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) {
1948   ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1949   assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals");
1950 
1951   MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
1952   if (FirstPos == DAG->end())
1953     return;
1954   RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos);
1955   RegionEndIdx = DAG->getLIS()->getInstructionIndex(
1956       *priorNonDebug(DAG->end(), DAG->begin()));
1957 
1958   for (SUnit &SU : DAG->SUnits) {
1959     if (!SU.getInstr()->isCopy())
1960       continue;
1961 
1962     constrainLocalCopy(&SU, static_cast<ScheduleDAGMILive*>(DAG));
1963   }
1964 }
1965 
1966 //===----------------------------------------------------------------------===//
1967 // MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler
1968 // and possibly other custom schedulers.
1969 //===----------------------------------------------------------------------===//
1970 
1971 static const unsigned InvalidCycle = ~0U;
1972 
1973 SchedBoundary::~SchedBoundary() { delete HazardRec; }
1974 
1975 /// Given a Count of resource usage and a Latency value, return true if a
1976 /// SchedBoundary becomes resource limited.
1977 /// If we are checking after scheduling a node, we should return true when
1978 /// we just reach the resource limit.
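/// For illustration only: with LFactor = 2, Latency = 3 and Count = 7,
/// ResCntFactor = 7 - 2*3 = 1, which does not exceed LFactor, so the zone is
/// not resource limited; with Count = 10, ResCntFactor = 4 >= LFactor, so a
/// check made after scheduling a node reports the zone as resource limited.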
1979 static bool checkResourceLimit(unsigned LFactor, unsigned Count,
1980                                unsigned Latency, bool AfterSchedNode) {
1981   int ResCntFactor = (int)(Count - (Latency * LFactor));
1982   if (AfterSchedNode)
1983     return ResCntFactor >= (int)LFactor;
1984   else
1985     return ResCntFactor > (int)LFactor;
1986 }
1987 
1988 void SchedBoundary::reset() {
1989   // A new HazardRec is created for each DAG and owned by SchedBoundary.
1990   // Destroying and reconstructing it is very expensive though. So keep
1991   // invalid, placeholder HazardRecs.
1992   if (HazardRec && HazardRec->isEnabled()) {
1993     delete HazardRec;
1994     HazardRec = nullptr;
1995   }
1996   Available.clear();
1997   Pending.clear();
1998   CheckPending = false;
1999   CurrCycle = 0;
2000   CurrMOps = 0;
2001   MinReadyCycle = std::numeric_limits<unsigned>::max();
2002   ExpectedLatency = 0;
2003   DependentLatency = 0;
2004   RetiredMOps = 0;
2005   MaxExecutedResCount = 0;
2006   ZoneCritResIdx = 0;
2007   IsResourceLimited = false;
2008   ReservedCycles.clear();
2009   ReservedCyclesIndex.clear();
2010   ResourceGroupSubUnitMasks.clear();
2011 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
2012   // Track the maximum number of stall cycles that could arise either from the
2013   // latency of a DAG edge or the number of cycles that a processor resource is
2014   // reserved (SchedBoundary::ReservedCycles).
2015   MaxObservedStall = 0;
2016 #endif
2017   // Reserve a zero-count for invalid CritResIdx.
2018   ExecutedResCounts.resize(1);
2019   assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
2020 }
2021 
2022 void SchedRemainder::
2023 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
2024   reset();
2025   if (!SchedModel->hasInstrSchedModel())
2026     return;
2027   RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
2028   for (SUnit &SU : DAG->SUnits) {
2029     const MCSchedClassDesc *SC = DAG->getSchedClass(&SU);
2030     RemIssueCount += SchedModel->getNumMicroOps(SU.getInstr(), SC)
2031       * SchedModel->getMicroOpFactor();
2032     for (TargetSchedModel::ProcResIter
2033            PI = SchedModel->getWriteProcResBegin(SC),
2034            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2035       unsigned PIdx = PI->ProcResourceIdx;
2036       unsigned Factor = SchedModel->getResourceFactor(PIdx);
2037       RemainingCounts[PIdx] += (Factor * PI->Cycles);
2038     }
2039   }
2040 }
2041 
2042 void SchedBoundary::
2043 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
2044   reset();
2045   DAG = dag;
2046   SchedModel = smodel;
2047   Rem = rem;
2048   if (SchedModel->hasInstrSchedModel()) {
2049     unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
2050     ReservedCyclesIndex.resize(ResourceCount);
2051     ExecutedResCounts.resize(ResourceCount);
2052     ResourceGroupSubUnitMasks.resize(ResourceCount, APInt(ResourceCount, 0));
2053     unsigned NumUnits = 0;
2054 
2055     for (unsigned i = 0; i < ResourceCount; ++i) {
2056       ReservedCyclesIndex[i] = NumUnits;
2057       NumUnits += SchedModel->getProcResource(i)->NumUnits;
2058       if (isUnbufferedGroup(i)) {
2059         auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin;
2060         for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits;
2061              U != UE; ++U)
2062           ResourceGroupSubUnitMasks[i].setBit(SubUnits[U]);
2063       }
2064     }
2065 
2066     ReservedCycles.resize(NumUnits, InvalidCycle);
2067   }
2068 }
2069 
2070 /// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
2071 /// these "soft stalls" differently than the hard stall cycles based on CPU
2072 /// resources and computed by checkHazard(). A fully in-order model
2073 /// (MicroOpBufferSize==0) will not make use of this since instructions are not
2074 /// available for scheduling until they are ready. However, a weaker in-order
2075 /// model may use this for heuristics. For example, if a processor has in-order
2076 /// behavior when reading certain resources, this may come into play.
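/// For example (illustrative numbers): if an unbuffered SU becomes ready at
/// cycle 7 while CurrCycle is 5, this returns a soft stall of 2 cycles;
/// buffered SUs always return 0.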
2077 unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) {
2078   if (!SU->isUnbuffered)
2079     return 0;
2080 
2081   unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
2082   if (ReadyCycle > CurrCycle)
2083     return ReadyCycle - CurrCycle;
2084   return 0;
2085 }
2086 
2087 /// Compute the next cycle at which the given processor resource unit
2088 /// can be scheduled.
2089 unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx,
2090                                                        unsigned Cycles) {
2091   unsigned NextUnreserved = ReservedCycles[InstanceIdx];
2092   // If this resource has never been used, always return cycle zero.
2093   if (NextUnreserved == InvalidCycle)
2094     return 0;
2095   // For bottom-up scheduling add the cycles needed for the current operation.
2096   if (!isTop())
2097     NextUnreserved += Cycles;
2098   return NextUnreserved;
2099 }
2100 
2101 /// Compute the next cycle at which the given processor resource can be
2102 /// scheduled.  Returns the next cycle and the index of the processor resource
2103 /// instance in the reserved cycles vector.
2104 std::pair<unsigned, unsigned>
2105 SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
2106                                     unsigned Cycles) {
2107 
2108   unsigned MinNextUnreserved = InvalidCycle;
2109   unsigned InstanceIdx = 0;
2110   unsigned StartIndex = ReservedCyclesIndex[PIdx];
2111   unsigned NumberOfInstances = SchedModel->getProcResource(PIdx)->NumUnits;
2112   assert(NumberOfInstances > 0 &&
2113          "Cannot have zero instances of a ProcResource");
2114 
2115   if (isUnbufferedGroup(PIdx)) {
2116     // If any subunits are used by the instruction, report that the resource
2117     // group is available at 0, effectively removing the group record from
2118     // hazarding and basing the hazarding decisions on the subunit records.
2119     // Otherwise, choose the first available instance from among the subunits.
2120     // Specifications which assign cycles to both the subunits and the group or
2121     // which use an unbuffered group with buffered subunits will appear to
2122     // schedule strangely. In the first case, the additional cycles for the
2123     // group will be ignored.  In the second, the group will be ignored
2124     // entirely.
2125     for (const MCWriteProcResEntry &PE :
2126          make_range(SchedModel->getWriteProcResBegin(SC),
2127                     SchedModel->getWriteProcResEnd(SC)))
2128       if (ResourceGroupSubUnitMasks[PIdx][PE.ProcResourceIdx])
2129         return std::make_pair(0u, StartIndex);
2130 
2131     auto SubUnits = SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin;
2132     for (unsigned I = 0, End = NumberOfInstances; I < End; ++I) {
2133       unsigned NextUnreserved, NextInstanceIdx;
2134       std::tie(NextUnreserved, NextInstanceIdx) =
2135           getNextResourceCycle(SC, SubUnits[I], Cycles);
2136       if (MinNextUnreserved > NextUnreserved) {
2137         InstanceIdx = NextInstanceIdx;
2138         MinNextUnreserved = NextUnreserved;
2139       }
2140     }
2141     return std::make_pair(MinNextUnreserved, InstanceIdx);
2142   }
2143 
2144   for (unsigned I = StartIndex, End = StartIndex + NumberOfInstances; I < End;
2145        ++I) {
2146     unsigned NextUnreserved = getNextResourceCycleByInstance(I, Cycles);
2147     if (MinNextUnreserved > NextUnreserved) {
2148       InstanceIdx = I;
2149       MinNextUnreserved = NextUnreserved;
2150     }
2151   }
2152   return std::make_pair(MinNextUnreserved, InstanceIdx);
2153 }
2154 
2155 /// Does this SU have a hazard within the current instruction group?
2156 ///
2157 /// The scheduler supports two modes of hazard recognition. The first is the
2158 /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
2159 /// supports highly complicated in-order reservation tables
2160 /// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
2161 ///
2162 /// The second is a streamlined mechanism that checks for hazards based on
2163 /// simple counters that the scheduler itself maintains. It explicitly checks
2164 /// for instruction dispatch limitations, including the number of micro-ops that
2165 /// can dispatch per cycle.
2166 ///
2167 /// TODO: Also check whether the SU must start a new group.
2168 bool SchedBoundary::checkHazard(SUnit *SU) {
2169   if (HazardRec->isEnabled()
2170       && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
2171     return true;
2172   }
2173 
2174   unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
2175   if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
2176     LLVM_DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") uops="
2177                       << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
2178     return true;
2179   }
2180 
2181   if (CurrMOps > 0 &&
2182       ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
2183        (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
2184     LLVM_DEBUG(dbgs() << "  hazard: SU(" << SU->NodeNum << ") must "
2185                       << (isTop() ? "begin" : "end") << " group\n");
2186     return true;
2187   }
2188 
2189   if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
2190     const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2191     for (const MCWriteProcResEntry &PE :
2192           make_range(SchedModel->getWriteProcResBegin(SC),
2193                      SchedModel->getWriteProcResEnd(SC))) {
2194       unsigned ResIdx = PE.ProcResourceIdx;
2195       unsigned Cycles = PE.Cycles;
2196       unsigned NRCycle, InstanceIdx;
2197       std::tie(NRCycle, InstanceIdx) = getNextResourceCycle(SC, ResIdx, Cycles);
2198       if (NRCycle > CurrCycle) {
2199 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
2200         MaxObservedStall = std::max(Cycles, MaxObservedStall);
2201 #endif
2202         LLVM_DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") "
2203                           << SchedModel->getResourceName(ResIdx)
2204                           << '[' << InstanceIdx - ReservedCyclesIndex[ResIdx]  << ']'
2205                           << "=" << NRCycle << "c\n");
2206         return true;
2207       }
2208     }
2209   }
2210   return false;
2211 }
2212 
2213 // Find the unscheduled node in ReadySUs with the highest latency.
2214 unsigned SchedBoundary::
2215 findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
2216   SUnit *LateSU = nullptr;
2217   unsigned RemLatency = 0;
2218   for (SUnit *SU : ReadySUs) {
2219     unsigned L = getUnscheduledLatency(SU);
2220     if (L > RemLatency) {
2221       RemLatency = L;
2222       LateSU = SU;
2223     }
2224   }
2225   if (LateSU) {
2226     LLVM_DEBUG(dbgs() << Available.getName() << " RemLatency SU("
2227                       << LateSU->NodeNum << ") " << RemLatency << "c\n");
2228   }
2229   return RemLatency;
2230 }
2231 
2232 // Count resources in this zone and the remaining unscheduled
2233 // instruction. Return the max count, scaled. Set OtherCritIdx to the critical
2234 // resource index, or zero if the zone is issue limited.
2235 unsigned SchedBoundary::
2236 getOtherResourceCount(unsigned &OtherCritIdx) {
2237   OtherCritIdx = 0;
2238   if (!SchedModel->hasInstrSchedModel())
2239     return 0;
2240 
2241   unsigned OtherCritCount = Rem->RemIssueCount
2242     + (RetiredMOps * SchedModel->getMicroOpFactor());
2243   LLVM_DEBUG(dbgs() << "  " << Available.getName() << " + Remain MOps: "
2244                     << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
2245   for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
2246        PIdx != PEnd; ++PIdx) {
2247     unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
2248     if (OtherCount > OtherCritCount) {
2249       OtherCritCount = OtherCount;
2250       OtherCritIdx = PIdx;
2251     }
2252   }
2253   if (OtherCritIdx) {
2254     LLVM_DEBUG(
2255         dbgs() << "  " << Available.getName() << " + Remain CritRes: "
2256                << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
2257                << " " << SchedModel->getResourceName(OtherCritIdx) << "\n");
2258   }
2259   return OtherCritCount;
2260 }
2261 
2262 void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle, bool InPQueue,
2263                                 unsigned Idx) {
2264   assert(SU->getInstr() && "Scheduled SUnit must have instr");
2265 
2266 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
2267   // ReadyCycle had been bumped up to the CurrCycle when this node was
2268   // scheduled, but CurrCycle may have been eagerly advanced immediately after
2269   // scheduling, so may now be greater than ReadyCycle.
2270   if (ReadyCycle > CurrCycle)
2271     MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall);
2272 #endif
2273 
2274   if (ReadyCycle < MinReadyCycle)
2275     MinReadyCycle = ReadyCycle;
2276 
2277   // Check for interlocks first. For the purpose of other heuristics, an
2278   // instruction that cannot issue appears as if it's not in the ReadyQueue.
2279   bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
2280   bool HazardDetected = (!IsBuffered && ReadyCycle > CurrCycle) ||
2281                         checkHazard(SU) || (Available.size() >= ReadyListLimit);
2282 
2283   if (!HazardDetected) {
2284     Available.push(SU);
2285 
2286     if (InPQueue)
2287       Pending.remove(Pending.begin() + Idx);
2288     return;
2289   }
2290 
2291   if (!InPQueue)
2292     Pending.push(SU);
2293 }
2294 
2295 /// Move the boundary of scheduled code by one cycle.
2296 void SchedBoundary::bumpCycle(unsigned NextCycle) {
2297   if (SchedModel->getMicroOpBufferSize() == 0) {
2298     assert(MinReadyCycle < std::numeric_limits<unsigned>::max() &&
2299            "MinReadyCycle uninitialized");
2300     if (MinReadyCycle > NextCycle)
2301       NextCycle = MinReadyCycle;
2302   }
2303   // Update the current micro-ops, which will issue in the next cycle.
2304   unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
2305   CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
2306 
2307   // Decrement DependentLatency based on the next cycle.
2308   if ((NextCycle - CurrCycle) > DependentLatency)
2309     DependentLatency = 0;
2310   else
2311     DependentLatency -= (NextCycle - CurrCycle);
2312 
2313   if (!HazardRec->isEnabled()) {
2314     // Bypass HazardRec virtual calls.
2315     CurrCycle = NextCycle;
2316   } else {
2317     // Bypass getHazardType calls in case of long latency.
2318     for (; CurrCycle != NextCycle; ++CurrCycle) {
2319       if (isTop())
2320         HazardRec->AdvanceCycle();
2321       else
2322         HazardRec->RecedeCycle();
2323     }
2324   }
2325   CheckPending = true;
2326   IsResourceLimited =
2327       checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
2328                          getScheduledLatency(), true);
2329 
2330   LLVM_DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName()
2331                     << '\n');
2332 }
2333 
2334 void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) {
2335   ExecutedResCounts[PIdx] += Count;
2336   if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
2337     MaxExecutedResCount = ExecutedResCounts[PIdx];
2338 }
2339 
2340 /// Add the given processor resource to this scheduled zone.
2341 ///
2342 /// \param Cycles indicates the number of consecutive (non-pipelined) cycles
2343 /// during which this resource is consumed.
2344 ///
2345 /// \return the next cycle at which the instruction may execute without
2346 /// oversubscribing resources.
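/// For illustration only: a resource with factor 2 that is consumed for 3
/// non-pipelined cycles contributes Count = 6 units, which is added to the
/// zone's executed count for that resource and subtracted from the remaining
/// counts.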
2347 unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx,
2348                                       unsigned Cycles, unsigned NextCycle) {
2349   unsigned Factor = SchedModel->getResourceFactor(PIdx);
2350   unsigned Count = Factor * Cycles;
2351   LLVM_DEBUG(dbgs() << "  " << SchedModel->getResourceName(PIdx) << " +"
2352                     << Cycles << "x" << Factor << "u\n");
2353 
2354   // Update Executed resources counts.
2355   incExecutedResources(PIdx, Count);
2356   assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
2357   Rem->RemainingCounts[PIdx] -= Count;
2358 
2359   // Check if this resource exceeds the current critical resource. If so, it
2360   // becomes the critical resource.
2361   if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
2362     ZoneCritResIdx = PIdx;
2363     LLVM_DEBUG(dbgs() << "  *** Critical resource "
2364                       << SchedModel->getResourceName(PIdx) << ": "
2365                       << getResourceCount(PIdx) / SchedModel->getLatencyFactor()
2366                       << "c\n");
2367   }
2368   // For reserved resources, record the highest cycle using the resource.
2369   unsigned NextAvailable, InstanceIdx;
2370   std::tie(NextAvailable, InstanceIdx) = getNextResourceCycle(SC, PIdx, Cycles);
2371   if (NextAvailable > CurrCycle) {
2372     LLVM_DEBUG(dbgs() << "  Resource conflict: "
2373                       << SchedModel->getResourceName(PIdx)
2374                       << '[' << InstanceIdx - ReservedCyclesIndex[PIdx]  << ']'
2375                       << " reserved until @" << NextAvailable << "\n");
2376   }
2377   return NextAvailable;
2378 }
2379 
2380 /// Move the boundary of scheduled code by one SUnit.
2381 void SchedBoundary::bumpNode(SUnit *SU) {
2382   // Update the reservation table.
2383   if (HazardRec->isEnabled()) {
2384     if (!isTop() && SU->isCall) {
2385       // Calls are scheduled with their preceding instructions. For bottom-up
2386       // scheduling, clear the pipeline state before emitting.
2387       HazardRec->Reset();
2388     }
2389     HazardRec->EmitInstruction(SU);
2390     // Scheduling an instruction may have made pending instructions available.
2391     CheckPending = true;
2392   }
2393   // checkHazard should prevent scheduling multiple instructions per cycle that
2394   // exceed the issue width.
2395   const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2396   unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
2397   assert(
2398       (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) &&
2399       "Cannot schedule this instruction's MicroOps in the current cycle.");
2400 
2401   unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
2402   LLVM_DEBUG(dbgs() << "  Ready @" << ReadyCycle << "c\n");
2403 
2404   unsigned NextCycle = CurrCycle;
2405   switch (SchedModel->getMicroOpBufferSize()) {
2406   case 0:
2407     assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
2408     break;
2409   case 1:
2410     if (ReadyCycle > NextCycle) {
2411       NextCycle = ReadyCycle;
2412       LLVM_DEBUG(dbgs() << "  *** Stall until: " << ReadyCycle << "\n");
2413     }
2414     break;
2415   default:
2416     // We don't currently model the OOO reorder buffer, so consider all
2417     // scheduled MOps to be "retired". We do loosely model in-order resource
2418     // latency. If this instruction uses an in-order resource, account for any
2419     // likely stall cycles.
2420     if (SU->isUnbuffered && ReadyCycle > NextCycle)
2421       NextCycle = ReadyCycle;
2422     break;
2423   }
2424   RetiredMOps += IncMOps;
2425 
2426   // Update resource counts and critical resource.
2427   if (SchedModel->hasInstrSchedModel()) {
2428     unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
2429     assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
2430     Rem->RemIssueCount -= DecRemIssue;
2431     if (ZoneCritResIdx) {
2432       // Scale scheduled micro-ops for comparing with the critical resource.
2433       unsigned ScaledMOps =
2434         RetiredMOps * SchedModel->getMicroOpFactor();
2435 
2436       // If scaled micro-ops are now more than the previous critical resource by
2437       // a full cycle, then micro-ops issue becomes critical.
2438       if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
2439           >= (int)SchedModel->getLatencyFactor()) {
2440         ZoneCritResIdx = 0;
2441         LLVM_DEBUG(dbgs() << "  *** Critical resource NumMicroOps: "
2442                           << ScaledMOps / SchedModel->getLatencyFactor()
2443                           << "c\n");
2444       }
2445     }
2446     for (TargetSchedModel::ProcResIter
2447            PI = SchedModel->getWriteProcResBegin(SC),
2448            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2449       unsigned RCycle =
2450         countResource(SC, PI->ProcResourceIdx, PI->Cycles, NextCycle);
2451       if (RCycle > NextCycle)
2452         NextCycle = RCycle;
2453     }
2454     if (SU->hasReservedResource) {
2455       // For reserved resources, record the highest cycle using the resource.
2456       // For top-down scheduling, this is the cycle in which we schedule this
2457       // instruction plus the number of cycles the operation reserves the
2458       // resource. For bottom-up it is simply the instruction's cycle.
2459       for (TargetSchedModel::ProcResIter
2460              PI = SchedModel->getWriteProcResBegin(SC),
2461              PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2462         unsigned PIdx = PI->ProcResourceIdx;
2463         if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
2464           unsigned ReservedUntil, InstanceIdx;
2465           std::tie(ReservedUntil, InstanceIdx) =
2466               getNextResourceCycle(SC, PIdx, 0);
2467           if (isTop()) {
2468             ReservedCycles[InstanceIdx] =
2469                 std::max(ReservedUntil, NextCycle + PI->Cycles);
2470           } else
2471             ReservedCycles[InstanceIdx] = NextCycle;
2472         }
2473       }
2474     }
2475   }
2476   // Update ExpectedLatency and DependentLatency.
2477   unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
2478   unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
2479   if (SU->getDepth() > TopLatency) {
2480     TopLatency = SU->getDepth();
2481     LLVM_DEBUG(dbgs() << "  " << Available.getName() << " TopLatency SU("
2482                       << SU->NodeNum << ") " << TopLatency << "c\n");
2483   }
2484   if (SU->getHeight() > BotLatency) {
2485     BotLatency = SU->getHeight();
2486     LLVM_DEBUG(dbgs() << "  " << Available.getName() << " BotLatency SU("
2487                       << SU->NodeNum << ") " << BotLatency << "c\n");
2488   }
2489   // If we stall for any reason, bump the cycle.
2490   if (NextCycle > CurrCycle)
2491     bumpCycle(NextCycle);
2492   else
2493     // After updating ZoneCritResIdx and ExpectedLatency, check if we're
2494     // resource limited. If a stall occurred, bumpCycle does this.
2495     IsResourceLimited =
2496         checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
2497                            getScheduledLatency(), true);
2498 
2499   // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle
2500   // resets CurrMOps. Loop to handle instructions with more MOps than can issue
2501   // in one cycle. Since we commonly reach the max MOps here, opportunistically
2502   // bump the cycle to avoid uselessly checking everything in the readyQ.
2503   CurrMOps += IncMOps;
2504 
2505   // Bump the cycle count for issue group constraints.
2506   // This must be done after NextCycle has been adjusted for all other stalls.
2507   // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set
2508   // CurrCycle to X.
2509   if ((isTop() &&  SchedModel->mustEndGroup(SU->getInstr())) ||
2510       (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
2511     LLVM_DEBUG(dbgs() << "  Bump cycle to " << (isTop() ? "end" : "begin")
2512                       << " group\n");
2513     bumpCycle(++NextCycle);
2514   }
2515 
2516   while (CurrMOps >= SchedModel->getIssueWidth()) {
2517     LLVM_DEBUG(dbgs() << "  *** Max MOps " << CurrMOps << " at cycle "
2518                       << CurrCycle << '\n');
2519     bumpCycle(++NextCycle);
2520   }
2521   LLVM_DEBUG(dumpScheduledState());
2522 }
2523 
2524 /// Release pending ready nodes into the available queue. This makes them
2525 /// visible to heuristics.
2526 void SchedBoundary::releasePending() {
2527   // If the available queue is empty, it is safe to reset MinReadyCycle.
2528   if (Available.empty())
2529     MinReadyCycle = std::numeric_limits<unsigned>::max();
2530 
2531   // Check to see if any of the pending instructions are ready to issue.  If
2532   // so, add them to the available queue.
2533   for (unsigned I = 0, E = Pending.size(); I < E; ++I) {
2534     SUnit *SU = *(Pending.begin() + I);
2535     unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
2536 
2537     if (ReadyCycle < MinReadyCycle)
2538       MinReadyCycle = ReadyCycle;
2539 
2540     if (Available.size() >= ReadyListLimit)
2541       break;
2542 
2543     releaseNode(SU, ReadyCycle, true, I);
2544     if (E != Pending.size()) {
2545       --I;
2546       --E;
2547     }
2548   }
2549   CheckPending = false;
2550 }
2551 
2552 /// Remove SU from the ready set for this boundary.
2553 void SchedBoundary::removeReady(SUnit *SU) {
2554   if (Available.isInQueue(SU))
2555     Available.remove(Available.find(SU));
2556   else {
2557     assert(Pending.isInQueue(SU) && "bad ready count");
2558     Pending.remove(Pending.find(SU));
2559   }
2560 }
2561 
2562 /// If this queue only has one ready candidate, return it. As a side effect,
2563 /// defer any nodes that now hit a hazard, and advance the cycle until at least
2564 /// one node is ready. If multiple instructions are ready, return NULL.
2565 SUnit *SchedBoundary::pickOnlyChoice() {
2566   if (CheckPending)
2567     releasePending();
2568 
2569   // Defer any ready instrs that now have a hazard.
2570   for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
2571     if (checkHazard(*I)) {
2572       Pending.push(*I);
2573       I = Available.remove(I);
2574       continue;
2575     }
2576     ++I;
2577   }
2578   for (unsigned i = 0; Available.empty(); ++i) {
2579 //  FIXME: Re-enable assert once PR20057 is resolved.
2580 //    assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) &&
2581 //           "permanent hazard");
2582     (void)i;
2583     bumpCycle(CurrCycle + 1);
2584     releasePending();
2585   }
2586 
2587   LLVM_DEBUG(Pending.dump());
2588   LLVM_DEBUG(Available.dump());
2589 
2590   if (Available.size() == 1)
2591     return *Available.begin();
2592   return nullptr;
2593 }
2594 
2595 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2596 
2597 /// Dump the content of the \ref ReservedCycles vector for the
2598 /// resources that are used in the basic block.
2599 ///
2600 LLVM_DUMP_METHOD void SchedBoundary::dumpReservedCycles() const {
2601   if (!SchedModel->hasInstrSchedModel())
2602     return;
2603 
2604   unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
2605   unsigned StartIdx = 0;
2606 
2607   for (unsigned ResIdx = 0; ResIdx < ResourceCount; ++ResIdx) {
2608     const unsigned NumUnits = SchedModel->getProcResource(ResIdx)->NumUnits;
2609     std::string ResName = SchedModel->getResourceName(ResIdx);
2610     for (unsigned UnitIdx = 0; UnitIdx < NumUnits; ++UnitIdx) {
2611       dbgs() << ResName << "(" << UnitIdx
2612              << ") = " << ReservedCycles[StartIdx + UnitIdx] << "\n";
2613     }
2614     StartIdx += NumUnits;
2615   }
2616 }
2617 
2618 // This is useful information to dump after bumpNode.
2619 // Note that the Queue contents are more useful before pickNodeFromQueue.
2620 LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const {
2621   unsigned ResFactor;
2622   unsigned ResCount;
2623   if (ZoneCritResIdx) {
2624     ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
2625     ResCount = getResourceCount(ZoneCritResIdx);
2626   } else {
2627     ResFactor = SchedModel->getMicroOpFactor();
2628     ResCount = RetiredMOps * ResFactor;
2629   }
2630   unsigned LFactor = SchedModel->getLatencyFactor();
2631   dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
2632          << "  Retired: " << RetiredMOps;
2633   dbgs() << "\n  Executed: " << getExecutedCount() / LFactor << "c";
2634   dbgs() << "\n  Critical: " << ResCount / LFactor << "c, "
2635          << ResCount / ResFactor << " "
2636          << SchedModel->getResourceName(ZoneCritResIdx)
2637          << "\n  ExpectedLatency: " << ExpectedLatency << "c\n"
2638          << (IsResourceLimited ? "  - Resource" : "  - Latency")
2639          << " limited.\n";
2640   if (MISchedDumpReservedCycles)
2641     dumpReservedCycles();
2642 }
2643 #endif
2644 
2645 //===----------------------------------------------------------------------===//
2646 // GenericScheduler - Generic implementation of MachineSchedStrategy.
2647 //===----------------------------------------------------------------------===//
2648 
2649 void GenericSchedulerBase::SchedCandidate::
2650 initResourceDelta(const ScheduleDAGMI *DAG,
2651                   const TargetSchedModel *SchedModel) {
2652   if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
2653     return;
2654 
2655   const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2656   for (TargetSchedModel::ProcResIter
2657          PI = SchedModel->getWriteProcResBegin(SC),
2658          PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2659     if (PI->ProcResourceIdx == Policy.ReduceResIdx)
2660       ResDelta.CritResources += PI->Cycles;
2661     if (PI->ProcResourceIdx == Policy.DemandResIdx)
2662       ResDelta.DemandedResources += PI->Cycles;
2663   }
2664 }
2665 
2666 /// Compute remaining latency. We need this both to determine whether the
2667 /// overall schedule has become latency-limited and whether the instructions
2668 /// outside this zone are resource or latency limited.
2669 ///
2670 /// The "dependent" latency is updated incrementally during scheduling as the
2671 /// max height/depth of scheduled nodes minus the cycles since it was
2672 /// scheduled:
2673 ///   DLat = max(N.depth - (CurrCycle - N.ReadyCycle)) for N in Zone
2674 ///
2675 /// The "independent" latency is the max ready queue depth:
2676 ///   ILat = max N.depth for N in Available|Pending
2677 ///
2678 /// RemainingLatency is the greater of independent and dependent latency.
2679 ///
2680 /// These computations are expensive, especially in DAGs with many edges, so
2681 /// only do them if necessary.
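/// For example (illustrative numbers): if the deepest node scheduled into the
/// zone has depth 10 and became ready 3 cycles before the current cycle,
/// DLat = 7; if the deepest node still in Available or Pending has depth 9,
/// then ILat = 9 and RemainingLatency = max(7, 9) = 9.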
2682 static unsigned computeRemLatency(SchedBoundary &CurrZone) {
2683   unsigned RemLatency = CurrZone.getDependentLatency();
2684   RemLatency = std::max(RemLatency,
2685                         CurrZone.findMaxLatency(CurrZone.Available.elements()));
2686   RemLatency = std::max(RemLatency,
2687                         CurrZone.findMaxLatency(CurrZone.Pending.elements()));
2688   return RemLatency;
2689 }
2690 
2691 /// Returns true if the current cycle plus remaining latency is greater than
2692 /// the critical path in the scheduling region.
2693 bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy,
2694                                                SchedBoundary &CurrZone,
2695                                                bool ComputeRemLatency,
2696                                                unsigned &RemLatency) const {
2697   // The current cycle is already greater than the critical path, so we are
2698   // already latency limited and don't need to compute the remaining latency.
2699   if (CurrZone.getCurrCycle() > Rem.CriticalPath)
2700     return true;
2701 
2702   // If we haven't scheduled anything yet, then we aren't latency limited.
2703   if (CurrZone.getCurrCycle() == 0)
2704     return false;
2705 
2706   if (ComputeRemLatency)
2707     RemLatency = computeRemLatency(CurrZone);
2708 
2709   return RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath;
2710 }
2711 
2712 /// Set the CandPolicy for a scheduling zone given the current resources and
2713 /// latencies inside and outside the zone.
2714 void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
2715                                      SchedBoundary &CurrZone,
2716                                      SchedBoundary *OtherZone) {
2717   // Apply preemptive heuristics based on the total latency and resources
2718   // inside and outside this zone. Potential stalls should be considered before
2719   // following this policy.
2720 
2721   // Compute the critical resource outside the zone.
2722   unsigned OtherCritIdx = 0;
2723   unsigned OtherCount =
2724     OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0;
2725 
2726   bool OtherResLimited = false;
2727   unsigned RemLatency = 0;
2728   bool RemLatencyComputed = false;
2729   if (SchedModel->hasInstrSchedModel() && OtherCount != 0) {
2730     RemLatency = computeRemLatency(CurrZone);
2731     RemLatencyComputed = true;
2732     OtherResLimited = checkResourceLimit(SchedModel->getLatencyFactor(),
2733                                          OtherCount, RemLatency, false);
2734   }
2735 
2736   // Schedule aggressively for latency in PostRA mode. We don't check for
2737   // acyclic latency during PostRA, and highly out-of-order processors will
2738   // skip PostRA scheduling.
2739   if (!OtherResLimited &&
2740       (IsPostRA || shouldReduceLatency(Policy, CurrZone, !RemLatencyComputed,
2741                                        RemLatency))) {
2742     Policy.ReduceLatency |= true;
2743     LLVM_DEBUG(dbgs() << "  " << CurrZone.Available.getName()
2744                       << " RemainingLatency " << RemLatency << " + "
2745                       << CurrZone.getCurrCycle() << "c > CritPath "
2746                       << Rem.CriticalPath << "\n");
2747   }
2748   // If the same resource is limiting inside and outside the zone, do nothing.
2749   if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
2750     return;
2751 
2752   LLVM_DEBUG(if (CurrZone.isResourceLimited()) {
2753     dbgs() << "  " << CurrZone.Available.getName() << " ResourceLimited: "
2754            << SchedModel->getResourceName(CurrZone.getZoneCritResIdx()) << "\n";
2755   } if (OtherResLimited) dbgs()
2756                  << "  RemainingLimit: "
2757                  << SchedModel->getResourceName(OtherCritIdx) << "\n";
2758              if (!CurrZone.isResourceLimited() && !OtherResLimited) dbgs()
2759              << "  Latency limited both directions.\n");
2760 
2761   if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx)
2762     Policy.ReduceResIdx = CurrZone.getZoneCritResIdx();
2763 
2764   if (OtherResLimited)
2765     Policy.DemandResIdx = OtherCritIdx;
2766 }
2767 
2768 #ifndef NDEBUG
2769 const char *GenericSchedulerBase::getReasonStr(
2770   GenericSchedulerBase::CandReason Reason) {
2771   switch (Reason) {
2772   case NoCand:         return "NOCAND    ";
2773   case Only1:          return "ONLY1     ";
2774   case PhysReg:        return "PHYS-REG  ";
2775   case RegExcess:      return "REG-EXCESS";
2776   case RegCritical:    return "REG-CRIT  ";
2777   case Stall:          return "STALL     ";
2778   case Cluster:        return "CLUSTER   ";
2779   case Weak:           return "WEAK      ";
2780   case RegMax:         return "REG-MAX   ";
2781   case ResourceReduce: return "RES-REDUCE";
2782   case ResourceDemand: return "RES-DEMAND";
2783   case TopDepthReduce: return "TOP-DEPTH ";
2784   case TopPathReduce:  return "TOP-PATH  ";
2785   case BotHeightReduce:return "BOT-HEIGHT";
2786   case BotPathReduce:  return "BOT-PATH  ";
2787   case NextDefUse:     return "DEF-USE   ";
2788   case NodeOrder:      return "ORDER     ";
2789   }
2790   llvm_unreachable("Unknown reason!");
2791 }
2792 
2793 void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
2794   PressureChange P;
2795   unsigned ResIdx = 0;
2796   unsigned Latency = 0;
2797   switch (Cand.Reason) {
2798   default:
2799     break;
2800   case RegExcess:
2801     P = Cand.RPDelta.Excess;
2802     break;
2803   case RegCritical:
2804     P = Cand.RPDelta.CriticalMax;
2805     break;
2806   case RegMax:
2807     P = Cand.RPDelta.CurrentMax;
2808     break;
2809   case ResourceReduce:
2810     ResIdx = Cand.Policy.ReduceResIdx;
2811     break;
2812   case ResourceDemand:
2813     ResIdx = Cand.Policy.DemandResIdx;
2814     break;
2815   case TopDepthReduce:
2816     Latency = Cand.SU->getDepth();
2817     break;
2818   case TopPathReduce:
2819     Latency = Cand.SU->getHeight();
2820     break;
2821   case BotHeightReduce:
2822     Latency = Cand.SU->getHeight();
2823     break;
2824   case BotPathReduce:
2825     Latency = Cand.SU->getDepth();
2826     break;
2827   }
2828   dbgs() << "  Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
2829   if (P.isValid())
2830     dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
2831            << ":" << P.getUnitInc() << " ";
2832   else
2833     dbgs() << "      ";
2834   if (ResIdx)
2835     dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
2836   else
2837     dbgs() << "         ";
2838   if (Latency)
2839     dbgs() << " " << Latency << " cycles ";
2840   else
2841     dbgs() << "          ";
2842   dbgs() << '\n';
2843 }
2844 #endif
2845 
2846 namespace llvm {
2847 /// Return true if this heuristic determines order.
2848 /// TODO: Consider refactoring the return type of these functions to an int or
2849 /// enum, as we may need to differentiate whether TryCand is better than Cand.
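     ///
     /// Usage sketch (illustrative): tryLess(1, 2, TryCand, Cand, Stall) marks
     /// TryCand.Reason = Stall and returns true; tryLess(2, 1, ...) keeps Cand as
     /// the better pick but may lower Cand.Reason to Stall and still returns true,
     /// because the heuristic decided the order; only a tie returns false so that
     /// lower-priority heuristics further down get a chance to break it.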
2850 bool tryLess(int TryVal, int CandVal,
2851              GenericSchedulerBase::SchedCandidate &TryCand,
2852              GenericSchedulerBase::SchedCandidate &Cand,
2853              GenericSchedulerBase::CandReason Reason) {
2854   if (TryVal < CandVal) {
2855     TryCand.Reason = Reason;
2856     return true;
2857   }
2858   if (TryVal > CandVal) {
2859     if (Cand.Reason > Reason)
2860       Cand.Reason = Reason;
2861     return true;
2862   }
2863   return false;
2864 }
2865 
2866 bool tryGreater(int TryVal, int CandVal,
2867                 GenericSchedulerBase::SchedCandidate &TryCand,
2868                 GenericSchedulerBase::SchedCandidate &Cand,
2869                 GenericSchedulerBase::CandReason Reason) {
2870   if (TryVal > CandVal) {
2871     TryCand.Reason = Reason;
2872     return true;
2873   }
2874   if (TryVal < CandVal) {
2875     if (Cand.Reason > Reason)
2876       Cand.Reason = Reason;
2877     return true;
2878   }
2879   return false;
2880 }
2881 
2882 bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
2883                 GenericSchedulerBase::SchedCandidate &Cand,
2884                 SchedBoundary &Zone) {
2885   if (Zone.isTop()) {
2886     // Prefer the candidate with the lesser depth, but only if one of them has
2887     // depth greater than the total latency scheduled so far, otherwise either
2888     // of them could be scheduled now with no stall.
2889     if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) >
2890         Zone.getScheduledLatency()) {
2891       if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2892                   TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
2893         return true;
2894     }
2895     if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2896                    TryCand, Cand, GenericSchedulerBase::TopPathReduce))
2897       return true;
2898   } else {
2899     // Prefer the candidate with the lesser height, but only if one of them has
2900     // height greater than the total latency scheduled so far, otherwise either
2901     // of them could be scheduled now with no stall.
2902     if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
2903         Zone.getScheduledLatency()) {
2904       if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2905                   TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
2906         return true;
2907     }
2908     if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2909                    TryCand, Cand, GenericSchedulerBase::BotPathReduce))
2910       return true;
2911   }
2912   return false;
2913 }
2914 } // end namespace llvm
2915 
2916 static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {
2917   LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
2918                     << GenericSchedulerBase::getReasonStr(Reason) << '\n');
2919 }
2920 
2921 static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
2922   tracePick(Cand.Reason, Cand.AtTop);
2923 }
2924 
2925 void GenericScheduler::initialize(ScheduleDAGMI *dag) {
2926   assert(dag->hasVRegLiveness() &&
2927          "(PreRA)GenericScheduler needs vreg liveness");
2928   DAG = static_cast<ScheduleDAGMILive*>(dag);
2929   SchedModel = DAG->getSchedModel();
2930   TRI = DAG->TRI;
2931 
2932   if (RegionPolicy.ComputeDFSResult)
2933     DAG->computeDFSResult();
2934 
2935   Rem.init(DAG, SchedModel);
2936   Top.init(DAG, SchedModel, &Rem);
2937   Bot.init(DAG, SchedModel, &Rem);
2938 
2939   // Initialize resource counts.
2940 
2941   // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
2942   // are disabled, then these HazardRecs will be disabled.
2943   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
2944   if (!Top.HazardRec) {
2945     Top.HazardRec =
2946         DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
2947             Itin, DAG);
2948   }
2949   if (!Bot.HazardRec) {
2950     Bot.HazardRec =
2951         DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
2952             Itin, DAG);
2953   }
2954   TopCand.SU = nullptr;
2955   BotCand.SU = nullptr;
2956 }
2957 
2958 /// Initialize the per-region scheduling policy.
2959 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
2960                                   MachineBasicBlock::iterator End,
2961                                   unsigned NumRegionInstrs) {
2962   const MachineFunction &MF = *Begin->getMF();
2963   const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
2964 
2965   // Avoid setting up the register pressure tracker for small regions to save
2966   // compile time. As a rough heuristic, only track pressure when the number of
2967   // schedulable instructions exceeds half the integer register file.
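       // For example, on a target with 32 allocatable integer registers, pressure
       // tracking would only be enabled for regions of more than 16 instructions.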
2968   RegionPolicy.ShouldTrackPressure = true;
2969   for (unsigned VT = MVT::i32; VT > (unsigned)MVT::i1; --VT) {
2970     MVT::SimpleValueType LegalIntVT = (MVT::SimpleValueType)VT;
2971     if (TLI->isTypeLegal(LegalIntVT)) {
2972       unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
2973         TLI->getRegClassFor(LegalIntVT));
2974       RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
2975     }
2976   }
2977 
2978   // For generic targets, we default to bottom-up, because it's simpler and more
2979   // compile-time optimizations have been implemented in that direction.
2980   RegionPolicy.OnlyBottomUp = true;
2981 
2982   // Allow the subtarget to override default policy.
2983   MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);
2984 
2985   // After subtarget overrides, apply command line options.
2986   if (!EnableRegPressure) {
2987     RegionPolicy.ShouldTrackPressure = false;
2988     RegionPolicy.ShouldTrackLaneMasks = false;
2989   }
2990 
2991   // Check whether -misched-topdown/bottomup forces or unforces the scheduling
2992   // direction; e.g. -misched-bottomup=false allows scheduling in both directions.
2993   assert((!ForceTopDown || !ForceBottomUp) &&
2994          "-misched-topdown incompatible with -misched-bottomup");
2995   if (ForceBottomUp.getNumOccurrences() > 0) {
2996     RegionPolicy.OnlyBottomUp = ForceBottomUp;
2997     if (RegionPolicy.OnlyBottomUp)
2998       RegionPolicy.OnlyTopDown = false;
2999   }
3000   if (ForceTopDown.getNumOccurrences() > 0) {
3001     RegionPolicy.OnlyTopDown = ForceTopDown;
3002     if (RegionPolicy.OnlyTopDown)
3003       RegionPolicy.OnlyBottomUp = false;
3004   }
3005 }
3006 
3007 void GenericScheduler::dumpPolicy() const {
3008   // Cannot completely remove virtual function even in release mode.
3009 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3010   dbgs() << "GenericScheduler RegionPolicy: "
3011          << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure
3012          << " OnlyTopDown=" << RegionPolicy.OnlyTopDown
3013          << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp
3014          << "\n";
3015 #endif
3016 }
3017 
3018 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
3019 /// critical path by more cycles than it takes to drain the instruction buffer.
3020 /// We estimate an upper bound on in-flight instructions as:
3021 ///
3022 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
3023 /// InFlightIterations = AcyclicPath / CyclesPerIteration
3024 /// InFlightResources = InFlightIterations * LoopResources
3025 ///
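     /// Worked example (hypothetical numbers): with a cyclic path of 4 cycles, an
     /// acyclic path of 40 cycles and roughly 8 micro-ops issued per iteration,
     /// about 40 / 4 = 10 iterations, i.e. around 80 micro-ops, would have to be
     /// in flight to hide the acyclic path; if that exceeds the subtarget's
     /// micro-op buffer, the loop is treated as acyclic-latency limited.
     ///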
3026 /// TODO: Check execution resources in addition to IssueCount.
3027 void GenericScheduler::checkAcyclicLatency() {
3028   if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
3029     return;
3030 
3031   // Scaled number of cycles per loop iteration.
3032   unsigned IterCount =
3033     std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
3034              Rem.RemIssueCount);
3035   // Scaled acyclic critical path.
3036   unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
3037   // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
3038   unsigned InFlightCount =
3039     (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
3040   unsigned BufferLimit =
3041     SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
3042 
3043   Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
3044 
3045   LLVM_DEBUG(
3046       dbgs() << "IssueCycles="
3047              << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
3048              << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
3049              << "c NumIters=" << (AcyclicCount + IterCount - 1) / IterCount
3050              << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
3051              << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
3052       if (Rem.IsAcyclicLatencyLimited) dbgs() << "  ACYCLIC LATENCY LIMIT\n");
3053 }
3054 
3055 void GenericScheduler::registerRoots() {
3056   Rem.CriticalPath = DAG->ExitSU.getDepth();
3057 
3058   // Some roots may not feed into ExitSU. Check all of them just in case.
3059   for (const SUnit *SU : Bot.Available) {
3060     if (SU->getDepth() > Rem.CriticalPath)
3061       Rem.CriticalPath = SU->getDepth();
3062   }
3063   LLVM_DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
3064   if (DumpCriticalPathLength) {
3065     errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
3066   }
3067 
3068   if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) {
3069     Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
3070     checkAcyclicLatency();
3071   }
3072 }
3073 
3074 namespace llvm {
3075 bool tryPressure(const PressureChange &TryP,
3076                  const PressureChange &CandP,
3077                  GenericSchedulerBase::SchedCandidate &TryCand,
3078                  GenericSchedulerBase::SchedCandidate &Cand,
3079                  GenericSchedulerBase::CandReason Reason,
3080                  const TargetRegisterInfo *TRI,
3081                  const MachineFunction &MF) {
3082   // If one candidate decreases and the other increases, go with it.
3083   // Invalid candidates have UnitInc==0.
3084   if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
3085                  Reason)) {
3086     return true;
3087   }
3088   // Do not compare the magnitude of pressure changes between top and bottom
3089   // boundary.
3090   if (Cand.AtTop != TryCand.AtTop)
3091     return false;
3092 
3093   // If both candidates affect the same set in the same boundary, go with the
3094   // smallest increase.
3095   unsigned TryPSet = TryP.getPSetOrMax();
3096   unsigned CandPSet = CandP.getPSetOrMax();
3097   if (TryPSet == CandPSet) {
3098     return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
3099                    Reason);
3100   }
3101 
3102   int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :
3103                                  std::numeric_limits<int>::max();
3104 
3105   int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) :
3106                                    std::numeric_limits<int>::max();
3107 
3108   // If the candidates are decreasing pressure, reverse priority.
3109   if (TryP.getUnitInc() < 0)
3110     std::swap(TryRank, CandRank);
3111   return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
3112 }
3113 
3114 unsigned getWeakLeft(const SUnit *SU, bool isTop) {
3115   return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
3116 }
3117 
3118 /// Minimize physical register live ranges. Regalloc wants them adjacent to
3119 /// their physreg def/use.
3120 ///
3121 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
3122 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
3123 /// with the operation that produces or consumes the physreg. We'll do this when
3124 /// regalloc has support for parallel copies.
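     ///
     /// Illustrative case: when scheduling bottom-up, a COPY whose source is a
     /// physical argument register and which has no unscheduled predecessors is
     /// deferred (bias -1) so it lands next to the region entry where the physreg
     /// is live; the bias only feeds the tryGreater comparison in tryCandidate and
     /// does not force an order on its own.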
3125 int biasPhysReg(const SUnit *SU, bool isTop) {
3126   const MachineInstr *MI = SU->getInstr();
3127 
3128   if (MI->isCopy()) {
3129     unsigned ScheduledOper = isTop ? 1 : 0;
3130     unsigned UnscheduledOper = isTop ? 0 : 1;
3131     // If we have already scheduled the physreg producer/consumer, immediately
3132     // schedule the copy.
3133     if (Register::isPhysicalRegister(MI->getOperand(ScheduledOper).getReg()))
3134       return 1;
3135     // If the physreg is at the boundary, defer it. Otherwise schedule it
3136     // immediately to free the dependent. We can hoist the copy later.
3137     bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
3138     if (Register::isPhysicalRegister(MI->getOperand(UnscheduledOper).getReg()))
3139       return AtBoundary ? -1 : 1;
3140   }
3141 
3142   if (MI->isMoveImmediate()) {
3143     // If we have a move immediate and all successors have been assigned, bias
3144     // towards scheduling this later. Make sure all register defs are to
3145     // physical registers.
3146     bool DoBias = true;
3147     for (const MachineOperand &Op : MI->defs()) {
3148       if (Op.isReg() && !Register::isPhysicalRegister(Op.getReg())) {
3149         DoBias = false;
3150         break;
3151       }
3152     }
3153 
3154     if (DoBias)
3155       return isTop ? -1 : 1;
3156   }
3157 
3158   return 0;
3159 }
3160 } // end namespace llvm
3161 
3162 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
3163                                      bool AtTop,
3164                                      const RegPressureTracker &RPTracker,
3165                                      RegPressureTracker &TempTracker) {
3166   Cand.SU = SU;
3167   Cand.AtTop = AtTop;
3168   if (DAG->isTrackingPressure()) {
3169     if (AtTop) {
3170       TempTracker.getMaxDownwardPressureDelta(
3171         Cand.SU->getInstr(),
3172         Cand.RPDelta,
3173         DAG->getRegionCriticalPSets(),
3174         DAG->getRegPressure().MaxSetPressure);
3175     } else {
3176       if (VerifyScheduling) {
3177         TempTracker.getMaxUpwardPressureDelta(
3178           Cand.SU->getInstr(),
3179           &DAG->getPressureDiff(Cand.SU),
3180           Cand.RPDelta,
3181           DAG->getRegionCriticalPSets(),
3182           DAG->getRegPressure().MaxSetPressure);
3183       } else {
3184         RPTracker.getUpwardPressureDelta(
3185           Cand.SU->getInstr(),
3186           DAG->getPressureDiff(Cand.SU),
3187           Cand.RPDelta,
3188           DAG->getRegionCriticalPSets(),
3189           DAG->getRegPressure().MaxSetPressure);
3190       }
3191     }
3192   }
3193   LLVM_DEBUG(if (Cand.RPDelta.Excess.isValid()) dbgs()
3194              << "  Try  SU(" << Cand.SU->NodeNum << ") "
3195              << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) << ":"
3196              << Cand.RPDelta.Excess.getUnitInc() << "\n");
3197 }
3198 
3199 /// Apply a set of heuristics to a new candidate. Heuristics are currently
3200 /// hierarchical. This may be more efficient than a graduated cost model because
3201 /// we don't need to evaluate all aspects of the model for each node in the
3202 /// queue. But it's really done to make the heuristics easier to debug and
3203 /// statistically analyze.
3204 ///
3205 /// \param Cand provides the policy and current best candidate.
3206 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
3207 /// \param Zone describes the scheduled zone that we are extending, or nullptr
3208 ///             if Cand is from a different zone than TryCand.
3209 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
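     ///
     /// Rough priority order of the heuristics below: physreg bias, excess
     /// pressure, critical pressure, acyclic latency and stall cycles, clustering,
     /// weak edges, max pressure, resource reduction/demand, latency, and finally
     /// the original node order; several of these apply only when both candidates
     /// come from the same boundary.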
3210 bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
3211                                     SchedCandidate &TryCand,
3212                                     SchedBoundary *Zone) const {
3213   // Initialize the candidate if needed.
3214   if (!Cand.isValid()) {
3215     TryCand.Reason = NodeOrder;
3216     return true;
3217   }
3218 
3219   // Bias PhysReg Defs and copies to their uses and definitions respectively.
3220   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
3221                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
3222     return TryCand.Reason != NoCand;
3223 
3224   // Avoid exceeding the target's limit.
3225   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
3226                                                Cand.RPDelta.Excess,
3227                                                TryCand, Cand, RegExcess, TRI,
3228                                                DAG->MF))
3229     return TryCand.Reason != NoCand;
3230 
3231   // Avoid increasing the max critical pressure in the scheduled region.
3232   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
3233                                                Cand.RPDelta.CriticalMax,
3234                                                TryCand, Cand, RegCritical, TRI,
3235                                                DAG->MF))
3236     return TryCand.Reason != NoCand;
3237 
3238   // We only compare a subset of features when comparing nodes between
3239   // Top and Bottom boundary. Some properties are simply incomparable; in many
3240   // other instances we should only override the other boundary if something
3241   // is a clear good pick on one boundary. Skip heuristics that are more
3242   // "tie-breaking" in nature.
3243   bool SameBoundary = Zone != nullptr;
3244   if (SameBoundary) {
3245     // For loops that are acyclic path limited, aggressively schedule for
3246     // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
3247     // heuristics to take precedence.
3248     if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
3249         tryLatency(TryCand, Cand, *Zone))
3250       return TryCand.Reason != NoCand;
3251 
3252     // Prioritize instructions that read unbuffered resources by stall cycles.
3253     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
3254                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
3255       return TryCand.Reason != NoCand;
3256   }
3257 
3258   // Keep clustered nodes together to encourage downstream peephole
3259   // optimizations which may reduce resource requirements.
3260   //
3261   // This is a best effort to set things up for a post-RA pass. Optimizations
3262   // like generating loads of multiple registers should ideally be done within
3263   // the scheduler pass by combining the loads during DAG postprocessing.
3264   const SUnit *CandNextClusterSU =
3265     Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
3266   const SUnit *TryCandNextClusterSU =
3267     TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
3268   if (tryGreater(TryCand.SU == TryCandNextClusterSU,
3269                  Cand.SU == CandNextClusterSU,
3270                  TryCand, Cand, Cluster))
3271     return TryCand.Reason != NoCand;
3272 
3273   if (SameBoundary) {
3274     // Weak edges are for clustering and other constraints.
3275     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
3276                 getWeakLeft(Cand.SU, Cand.AtTop),
3277                 TryCand, Cand, Weak))
3278       return TryCand.Reason != NoCand;
3279   }
3280 
3281   // Avoid increasing the max pressure of the entire region.
3282   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
3283                                                Cand.RPDelta.CurrentMax,
3284                                                TryCand, Cand, RegMax, TRI,
3285                                                DAG->MF))
3286     return TryCand.Reason != NoCand;
3287 
3288   if (SameBoundary) {
3289     // Avoid critical resource consumption and balance the schedule.
3290     TryCand.initResourceDelta(DAG, SchedModel);
3291     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
3292                 TryCand, Cand, ResourceReduce))
3293       return TryCand.Reason != NoCand;
3294     if (tryGreater(TryCand.ResDelta.DemandedResources,
3295                    Cand.ResDelta.DemandedResources,
3296                    TryCand, Cand, ResourceDemand))
3297       return TryCand.Reason != NoCand;
3298 
3299     // Avoid serializing long latency dependence chains.
3300     // For acyclic path limited loops, latency was already checked above.
3301     if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
3302         !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
3303       return TryCand.Reason != NoCand;
3304 
3305     // Fall through to original instruction order.
3306     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
3307         || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
3308       TryCand.Reason = NodeOrder;
3309       return true;
3310     }
3311   }
3312 
3313   return false;
3314 }
3315 
3316 /// Pick the best candidate from the queue.
3317 ///
3318 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
3319 /// DAG building. To adjust for the current scheduling location we need to
3320 /// maintain the number of vreg uses remaining to be top-scheduled.
3321 void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
3322                                          const CandPolicy &ZonePolicy,
3323                                          const RegPressureTracker &RPTracker,
3324                                          SchedCandidate &Cand) {
3325   // getMaxPressureDelta temporarily modifies the tracker.
3326   RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
3327 
3328   ReadyQueue &Q = Zone.Available;
3329   for (SUnit *SU : Q) {
3330 
3331     SchedCandidate TryCand(ZonePolicy);
3332     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker);
3333     // Pass SchedBoundary only when comparing nodes from the same boundary.
3334     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
3335     if (tryCandidate(Cand, TryCand, ZoneArg)) {
3336       // Initialize resource delta if needed in case future heuristics query it.
3337       if (TryCand.ResDelta == SchedResourceDelta())
3338         TryCand.initResourceDelta(DAG, SchedModel);
3339       Cand.setBest(TryCand);
3340       LLVM_DEBUG(traceCandidate(Cand));
3341     }
3342   }
3343 }
3344 
3345 /// Pick the best candidate node from either the top or bottom queue.
3346 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
3347   // Schedule as far as possible in the direction of no choice. This is most
3348   // efficient, but also provides the best heuristics for CriticalPSets.
3349   if (SUnit *SU = Bot.pickOnlyChoice()) {
3350     IsTopNode = false;
3351     tracePick(Only1, false);
3352     return SU;
3353   }
3354   if (SUnit *SU = Top.pickOnlyChoice()) {
3355     IsTopNode = true;
3356     tracePick(Only1, true);
3357     return SU;
3358   }
3359   // Set the bottom-up policy based on the state of the current bottom zone and
3360   // the instructions outside the zone, including the top zone.
3361   CandPolicy BotPolicy;
3362   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
3363   // Set the top-down policy based on the state of the current top zone and
3364   // the instructions outside the zone, including the bottom zone.
3365   CandPolicy TopPolicy;
3366   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
3367 
3368   // See if BotCand is still valid (because we previously scheduled from Top).
3369   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
3370   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
3371       BotCand.Policy != BotPolicy) {
3372     BotCand.reset(CandPolicy());
3373     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
3374     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
3375   } else {
3376     LLVM_DEBUG(traceCandidate(BotCand));
3377 #ifndef NDEBUG
3378     if (VerifyScheduling) {
3379       SchedCandidate TCand;
3380       TCand.reset(CandPolicy());
3381       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
3382       assert(TCand.SU == BotCand.SU &&
3383              "Last pick result should correspond to re-picking right now");
3384     }
3385 #endif
3386   }
3387 
3388   // Check if the top Q has a better candidate.
3389   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
3390   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
3391       TopCand.Policy != TopPolicy) {
3392     TopCand.reset(CandPolicy());
3393     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
3394     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
3395   } else {
3396     LLVM_DEBUG(traceCandidate(TopCand));
3397 #ifndef NDEBUG
3398     if (VerifyScheduling) {
3399       SchedCandidate TCand;
3400       TCand.reset(CandPolicy());
3401       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
3402       assert(TCand.SU == TopCand.SU &&
3403            "Last pick result should correspond to re-picking right now");
3404     }
3405 #endif
3406   }
3407 
3408   // Pick best from BotCand and TopCand.
3409   assert(BotCand.isValid());
3410   assert(TopCand.isValid());
3411   SchedCandidate Cand = BotCand;
3412   TopCand.Reason = NoCand;
3413   if (tryCandidate(Cand, TopCand, nullptr)) {
3414     Cand.setBest(TopCand);
3415     LLVM_DEBUG(traceCandidate(Cand));
3416   }
3417 
3418   IsTopNode = Cand.AtTop;
3419   tracePick(Cand);
3420   return Cand.SU;
3421 }
3422 
3423 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
3424 SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
3425   if (DAG->top() == DAG->bottom()) {
3426     assert(Top.Available.empty() && Top.Pending.empty() &&
3427            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
3428     return nullptr;
3429   }
3430   SUnit *SU;
3431   do {
3432     if (RegionPolicy.OnlyTopDown) {
3433       SU = Top.pickOnlyChoice();
3434       if (!SU) {
3435         CandPolicy NoPolicy;
3436         TopCand.reset(NoPolicy);
3437         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
3438         assert(TopCand.Reason != NoCand && "failed to find a candidate");
3439         tracePick(TopCand);
3440         SU = TopCand.SU;
3441       }
3442       IsTopNode = true;
3443     } else if (RegionPolicy.OnlyBottomUp) {
3444       SU = Bot.pickOnlyChoice();
3445       if (!SU) {
3446         CandPolicy NoPolicy;
3447         BotCand.reset(NoPolicy);
3448         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
3449         assert(BotCand.Reason != NoCand && "failed to find a candidate");
3450         tracePick(BotCand);
3451         SU = BotCand.SU;
3452       }
3453       IsTopNode = false;
3454     } else {
3455       SU = pickNodeBidirectional(IsTopNode);
3456     }
3457   } while (SU->isScheduled);
3458 
3459   if (SU->isTopReady())
3460     Top.removeReady(SU);
3461   if (SU->isBottomReady())
3462     Bot.removeReady(SU);
3463 
3464   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
3465                     << *SU->getInstr());
3466   return SU;
3467 }
3468 
3469 void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
3470   MachineBasicBlock::iterator InsertPos = SU->getInstr();
3471   if (!isTop)
3472     ++InsertPos;
3473   SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs;
3474 
3475   // Find already scheduled copies with a single physreg dependence and move
3476   // them just above the scheduled instruction.
3477   for (SDep &Dep : Deps) {
3478     if (Dep.getKind() != SDep::Data ||
3479         !Register::isPhysicalRegister(Dep.getReg()))
3480       continue;
3481     SUnit *DepSU = Dep.getSUnit();
3482     if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
3483       continue;
3484     MachineInstr *Copy = DepSU->getInstr();
3485     if (!Copy->isCopy() && !Copy->isMoveImmediate())
3486       continue;
3487     LLVM_DEBUG(dbgs() << "  Rescheduling physreg copy ";
3488                DAG->dumpNode(*Dep.getSUnit()));
3489     DAG->moveInstruction(Copy, InsertPos);
3490   }
3491 }
3492 
3493 /// Update the scheduler's state after scheduling a node. This is the same node
3494 /// that was just returned by pickNode(). However, ScheduleDAGMILive needs to
3495 /// update its state based on the current cycle before MachineSchedStrategy
3496 /// does.
3497 ///
3498 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
3499 /// them here. See comments in biasPhysReg.
3500 void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
3501   if (IsTopNode) {
3502     SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
3503     Top.bumpNode(SU);
3504     if (SU->hasPhysRegUses)
3505       reschedulePhysReg(SU, true);
3506   } else {
3507     SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
3508     Bot.bumpNode(SU);
3509     if (SU->hasPhysRegDefs)
3510       reschedulePhysReg(SU, false);
3511   }
3512 }
3513 
3514 /// Create the standard converging machine scheduler. This will be used as the
3515 /// default scheduler if the target does not set a default.
3516 ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
3517   ScheduleDAGMILive *DAG =
3518       new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
3519   // Register DAG post-processors.
3520   //
3521   // FIXME: extend the mutation API to allow earlier mutations to instantiate
3522   // data and pass it to later mutations. Have a single mutation that gathers
3523   // the interesting nodes in one pass.
3524   DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
3525   return DAG;
3526 }
3527 
3528 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
3529   return createGenericSchedLive(C);
3530 }
3531 
3532 static MachineSchedRegistry
3533 GenericSchedRegistry("converge", "Standard converging scheduler.",
3534                      createConvergingSched);
3535 
3536 //===----------------------------------------------------------------------===//
3537 // PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
3538 //===----------------------------------------------------------------------===//
3539 
3540 void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
3541   DAG = Dag;
3542   SchedModel = DAG->getSchedModel();
3543   TRI = DAG->TRI;
3544 
3545   Rem.init(DAG, SchedModel);
3546   Top.init(DAG, SchedModel, &Rem);
3547   BotRoots.clear();
3548 
3549   // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
3550   // or are disabled, then these HazardRecs will be disabled.
3551   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
3552   if (!Top.HazardRec) {
3553     Top.HazardRec =
3554         DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
3555             Itin, DAG);
3556   }
3557 }
3558 
3559 void PostGenericScheduler::registerRoots() {
3560   Rem.CriticalPath = DAG->ExitSU.getDepth();
3561 
3562   // Some roots may not feed into ExitSU. Check all of them just in case.
3563   for (const SUnit *SU : BotRoots) {
3564     if (SU->getDepth() > Rem.CriticalPath)
3565       Rem.CriticalPath = SU->getDepth();
3566   }
3567   LLVM_DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
3568   if (DumpCriticalPathLength) {
3569     errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n";
3570   }
3571 }
3572 
3573 /// Apply a set of heuristics to a new candidate for PostRA scheduling.
3574 ///
3575 /// \param Cand provides the policy and current best candidate.
3576 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
3577 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
3578 bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
3579                                         SchedCandidate &TryCand) {
3580   // Initialize the candidate if needed.
3581   if (!Cand.isValid()) {
3582     TryCand.Reason = NodeOrder;
3583     return true;
3584   }
3585 
3586   // Prioritize instructions that read unbuffered resources by stall cycles.
3587   if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
3588               Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
3589     return TryCand.Reason != NoCand;
3590 
3591   // Keep clustered nodes together.
3592   if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
3593                  Cand.SU == DAG->getNextClusterSucc(),
3594                  TryCand, Cand, Cluster))
3595     return TryCand.Reason != NoCand;
3596 
3597   // Avoid critical resource consumption and balance the schedule.
3598   if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
3599               TryCand, Cand, ResourceReduce))
3600     return TryCand.Reason != NoCand;
3601   if (tryGreater(TryCand.ResDelta.DemandedResources,
3602                  Cand.ResDelta.DemandedResources,
3603                  TryCand, Cand, ResourceDemand))
3604     return TryCand.Reason != NoCand;
3605 
3606   // Avoid serializing long latency dependence chains.
3607   if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
3608     return TryCand.Reason != NoCand;
3609   }
3610 
3611   // Fall through to original instruction order.
3612   if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
3613     TryCand.Reason = NodeOrder;
3614     return true;
3615   }
3616 
3617   return false;
3618 }
3619 
3620 void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
3621   ReadyQueue &Q = Top.Available;
3622   for (SUnit *SU : Q) {
3623     SchedCandidate TryCand(Cand.Policy);
3624     TryCand.SU = SU;
3625     TryCand.AtTop = true;
3626     TryCand.initResourceDelta(DAG, SchedModel);
3627     if (tryCandidate(Cand, TryCand)) {
3628       Cand.setBest(TryCand);
3629       LLVM_DEBUG(traceCandidate(Cand));
3630     }
3631   }
3632 }
3633 
3634 /// Pick the next node to schedule.
3635 SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
3636   if (DAG->top() == DAG->bottom()) {
3637     assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage");
3638     return nullptr;
3639   }
3640   SUnit *SU;
3641   do {
3642     SU = Top.pickOnlyChoice();
3643     if (SU) {
3644       tracePick(Only1, true);
3645     } else {
3646       CandPolicy NoPolicy;
3647       SchedCandidate TopCand(NoPolicy);
3648       // Set the top-down policy based on the state of the current top zone and
3649       // the instructions outside the zone, including the bottom zone.
3650       setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
3651       pickNodeFromQueue(TopCand);
3652       assert(TopCand.Reason != NoCand && "failed to find a candidate");
3653       tracePick(TopCand);
3654       SU = TopCand.SU;
3655     }
3656   } while (SU->isScheduled);
3657 
3658   IsTopNode = true;
3659   Top.removeReady(SU);
3660 
3661   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
3662                     << *SU->getInstr());
3663   return SU;
3664 }
3665 
3666 /// Called after ScheduleDAGMI has scheduled an instruction and updated
3667 /// scheduled/remaining flags in the DAG nodes.
3668 void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
3669   SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
3670   Top.bumpNode(SU);
3671 }
3672 
3673 ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
3674   return new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(C),
3675                            /*RemoveKillFlags=*/true);
3676 }
3677 
3678 //===----------------------------------------------------------------------===//
3679 // ILP Scheduler. Currently for experimental analysis of heuristics.
3680 //===----------------------------------------------------------------------===//
3681 
3682 namespace {
3683 
3684 /// Order nodes by the ILP metric.
3685 struct ILPOrder {
3686   const SchedDFSResult *DFSResult = nullptr;
3687   const BitVector *ScheduledTrees = nullptr;
3688   bool MaximizeILP;
3689 
3690   ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}
3691 
3692   /// Apply a less-than relation on node priority.
3693   ///
3694   /// (Return true if A comes after B in the Q.)
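       ///
       /// Illustrative tie-breaking (a sketch of the comparisons below): nodes in
       /// an already scheduled subtree are preferred over nodes in untouched
       /// subtrees, subtrees with deeper connections are preferred over shallower
       /// ones, and only then does the ILP metric itself (maximized or minimized,
       /// depending on MaximizeILP) decide.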
3695   bool operator()(const SUnit *A, const SUnit *B) const {
3696     unsigned SchedTreeA = DFSResult->getSubtreeID(A);
3697     unsigned SchedTreeB = DFSResult->getSubtreeID(B);
3698     if (SchedTreeA != SchedTreeB) {
3699       // Unscheduled trees have lower priority.
3700       if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
3701         return ScheduledTrees->test(SchedTreeB);
3702 
3703       // Trees with shallower connections have lower priority.
3704       if (DFSResult->getSubtreeLevel(SchedTreeA)
3705           != DFSResult->getSubtreeLevel(SchedTreeB)) {
3706         return DFSResult->getSubtreeLevel(SchedTreeA)
3707           < DFSResult->getSubtreeLevel(SchedTreeB);
3708       }
3709     }
3710     if (MaximizeILP)
3711       return DFSResult->getILP(A) < DFSResult->getILP(B);
3712     else
3713       return DFSResult->getILP(A) > DFSResult->getILP(B);
3714   }
3715 };
3716 
3717 /// Schedule based on the ILP metric.
3718 class ILPScheduler : public MachineSchedStrategy {
3719   ScheduleDAGMILive *DAG = nullptr;
3720   ILPOrder Cmp;
3721 
3722   std::vector<SUnit*> ReadyQ;
3723 
3724 public:
3725   ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {}
3726 
3727   void initialize(ScheduleDAGMI *dag) override {
3728     assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
3729     DAG = static_cast<ScheduleDAGMILive*>(dag);
3730     DAG->computeDFSResult();
3731     Cmp.DFSResult = DAG->getDFSResult();
3732     Cmp.ScheduledTrees = &DAG->getScheduledTrees();
3733     ReadyQ.clear();
3734   }
3735 
3736   void registerRoots() override {
3737     // Restore the heap in ReadyQ with the updated DFS results.
3738     std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3739   }
3740 
3741   /// Implement MachineSchedStrategy interface.
3742   /// -----------------------------------------
3743 
3744   /// Callback to select the highest priority node from the ready Q.
3745   SUnit *pickNode(bool &IsTopNode) override {
3746     if (ReadyQ.empty()) return nullptr;
3747     std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3748     SUnit *SU = ReadyQ.back();
3749     ReadyQ.pop_back();
3750     IsTopNode = false;
3751     LLVM_DEBUG(dbgs() << "Pick node "
3752                       << "SU(" << SU->NodeNum << ") "
3753                       << " ILP: " << DAG->getDFSResult()->getILP(SU)
3754                       << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU)
3755                       << " @"
3756                       << DAG->getDFSResult()->getSubtreeLevel(
3757                              DAG->getDFSResult()->getSubtreeID(SU))
3758                       << '\n'
3759                       << "Scheduling " << *SU->getInstr());
3760     return SU;
3761   }
3762 
3763   /// Scheduler callback to notify that a new subtree is scheduled.
3764   void scheduleTree(unsigned SubtreeID) override {
3765     std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3766   }
3767 
3768   /// Callback after a node is scheduled. Mark a newly scheduled tree, notify
3769   /// DFSResults, and resort the priority Q.
3770   void schedNode(SUnit *SU, bool IsTopNode) override {
3771     assert(!IsTopNode && "SchedDFSResult needs bottom-up");
3772   }
3773 
3774   void releaseTopNode(SUnit *) override { /*only called for top roots*/ }
3775 
3776   void releaseBottomNode(SUnit *SU) override {
3777     ReadyQ.push_back(SU);
3778     std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3779   }
3780 };
3781 
3782 } // end anonymous namespace
3783 
3784 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
3785   return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(true));
3786 }
3787 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
3788   return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(false));
3789 }
3790 
3791 static MachineSchedRegistry ILPMaxRegistry(
3792   "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
3793 static MachineSchedRegistry ILPMinRegistry(
3794   "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);
3795 
3796 //===----------------------------------------------------------------------===//
3797 // Machine Instruction Shuffler for Correctness Testing
3798 //===----------------------------------------------------------------------===//
3799 
3800 #ifndef NDEBUG
3801 namespace {
3802 
3803 /// Apply a less-than relation on the node order, which corresponds to the
3804 /// instruction order prior to scheduling. IsReverse implements greater-than.
3805 template<bool IsReverse>
3806 struct SUnitOrder {
3807   bool operator()(SUnit *A, SUnit *B) const {
3808     if (IsReverse)
3809       return A->NodeNum > B->NodeNum;
3810     else
3811       return A->NodeNum < B->NodeNum;
3812   }
3813 };
3814 
3815 /// Reorder instructions as much as possible.
3816 class InstructionShuffler : public MachineSchedStrategy {
3817   bool IsAlternating;
3818   bool IsTopDown;
3819 
3820   // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
3821   // gives nodes with a higher number higher priority, causing the latest
3822   // instructions to be scheduled first.
3823   PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>
3824     TopQ;
3825 
3826   // When scheduling bottom-up, use greater-than as the queue priority.
3827   PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>
3828     BottomQ;
3829 
3830 public:
3831   InstructionShuffler(bool alternate, bool topdown)
3832     : IsAlternating(alternate), IsTopDown(topdown) {}
3833 
3834   void initialize(ScheduleDAGMI*) override {
3835     TopQ.clear();
3836     BottomQ.clear();
3837   }
3838 
3839   /// Implement MachineSchedStrategy interface.
3840   /// -----------------------------------------
3841 
3842   SUnit *pickNode(bool &IsTopNode) override {
3843     SUnit *SU;
3844     if (IsTopDown) {
3845       do {
3846         if (TopQ.empty()) return nullptr;
3847         SU = TopQ.top();
3848         TopQ.pop();
3849       } while (SU->isScheduled);
3850       IsTopNode = true;
3851     } else {
3852       do {
3853         if (BottomQ.empty()) return nullptr;
3854         SU = BottomQ.top();
3855         BottomQ.pop();
3856       } while (SU->isScheduled);
3857       IsTopNode = false;
3858     }
3859     if (IsAlternating)
3860       IsTopDown = !IsTopDown;
3861     return SU;
3862   }
3863 
3864   void schedNode(SUnit *SU, bool IsTopNode) override {}
3865 
3866   void releaseTopNode(SUnit *SU) override {
3867     TopQ.push(SU);
3868   }
3869   void releaseBottomNode(SUnit *SU) override {
3870     BottomQ.push(SU);
3871   }
3872 };
3873 
3874 } // end anonymous namespace
3875 
3876 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
3877   bool Alternate = !ForceTopDown && !ForceBottomUp;
3878   bool TopDown = !ForceBottomUp;
3879   assert((TopDown || !ForceTopDown) &&
3880          "-misched-topdown incompatible with -misched-bottomup");
3881   return new ScheduleDAGMILive(
3882       C, std::make_unique<InstructionShuffler>(Alternate, TopDown));
3883 }
3884 
3885 static MachineSchedRegistry ShufflerRegistry(
3886   "shuffle", "Shuffle machine instructions alternating directions",
3887   createInstructionShuffler);
3888 #endif // !NDEBUG
3889 
3890 //===----------------------------------------------------------------------===//
3891 // GraphWriter support for ScheduleDAGMILive.
3892 //===----------------------------------------------------------------------===//
3893 
3894 #ifndef NDEBUG
3895 namespace llvm {
3896 
3897 template<> struct GraphTraits<
3898   ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
3899 
3900 template<>
3901 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
3902   DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
3903 
3904   static std::string getGraphName(const ScheduleDAG *G) {
3905     return std::string(G->MF.getName());
3906   }
3907 
3908   static bool renderGraphFromBottomUp() {
3909     return true;
3910   }
3911 
3912   static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
3913     if (ViewMISchedCutoff == 0)
3914       return false;
3915     return (Node->Preds.size() > ViewMISchedCutoff
3916          || Node->Succs.size() > ViewMISchedCutoff);
3917   }
3918 
3919   /// If you want to override the dot attributes printed for a particular
3920   /// edge, override this method.
3921   static std::string getEdgeAttributes(const SUnit *Node,
3922                                        SUnitIterator EI,
3923                                        const ScheduleDAG *Graph) {
3924     if (EI.isArtificialDep())
3925       return "color=cyan,style=dashed";
3926     if (EI.isCtrlDep())
3927       return "color=blue,style=dashed";
3928     return "";
3929   }
3930 
3931   static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
3932     std::string Str;
3933     raw_string_ostream SS(Str);
3934     const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
3935     const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
3936       static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
3937     SS << "SU:" << SU->NodeNum;
3938     if (DFS)
3939       SS << " I:" << DFS->getNumInstrs(SU);
3940     return SS.str();
3941   }
3942 
3943   static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
3944     return G->getGraphNodeLabel(SU);
3945   }
3946 
3947   static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) {
3948     std::string Str("shape=Mrecord");
3949     const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
3950     const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
3951       static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
3952     if (DFS) {
3953       Str += ",style=filled,fillcolor=\"#";
3954       Str += DOT::getColorString(DFS->getSubtreeID(N));
3955       Str += '"';
3956     }
3957     return Str;
3958   }
3959 };
3960 
3961 } // end namespace llvm
3962 #endif // NDEBUG
3963 
3964 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
3965 /// rendered using 'dot'.
3966 void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) {
3967 #ifndef NDEBUG
3968   ViewGraph(this, Name, false, Title);
3969 #else
3970   errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on "
3971          << "systems with Graphviz or gv!\n";
3972 #endif  // NDEBUG
3973 }
3974 
3975 /// Out-of-line implementation with no arguments is handy for gdb.
3976 void ScheduleDAGMI::viewGraph() {
3977   viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
3978 }
3979