1 //===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // MachineScheduler schedules machine instructions after phi elimination. It
10 // preserves LiveIntervals so it can be invoked before register allocation.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/CodeGen/MachineScheduler.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/BitVector.h"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/PriorityQueue.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/ADT/iterator_range.h"
23 #include "llvm/Analysis/AliasAnalysis.h"
24 #include "llvm/CodeGen/LiveInterval.h"
25 #include "llvm/CodeGen/LiveIntervals.h"
26 #include "llvm/CodeGen/MachineBasicBlock.h"
27 #include "llvm/CodeGen/MachineDominators.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineFunctionPass.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineLoopInfo.h"
32 #include "llvm/CodeGen/MachineOperand.h"
33 #include "llvm/CodeGen/MachinePassRegistry.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/CodeGen/Passes.h"
36 #include "llvm/CodeGen/RegisterClassInfo.h"
37 #include "llvm/CodeGen/RegisterPressure.h"
38 #include "llvm/CodeGen/ScheduleDAG.h"
39 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
40 #include "llvm/CodeGen/ScheduleDAGMutation.h"
41 #include "llvm/CodeGen/ScheduleDFS.h"
42 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
43 #include "llvm/CodeGen/SlotIndexes.h"
44 #include "llvm/CodeGen/TargetFrameLowering.h"
45 #include "llvm/CodeGen/TargetInstrInfo.h"
46 #include "llvm/CodeGen/TargetLowering.h"
47 #include "llvm/CodeGen/TargetPassConfig.h"
48 #include "llvm/CodeGen/TargetRegisterInfo.h"
49 #include "llvm/CodeGen/TargetSchedule.h"
50 #include "llvm/CodeGen/TargetSubtargetInfo.h"
51 #include "llvm/Config/llvm-config.h"
52 #include "llvm/InitializePasses.h"
53 #include "llvm/MC/LaneBitmask.h"
54 #include "llvm/Pass.h"
55 #include "llvm/Support/CommandLine.h"
56 #include "llvm/Support/Compiler.h"
57 #include "llvm/Support/Debug.h"
58 #include "llvm/Support/ErrorHandling.h"
59 #include "llvm/Support/GraphWriter.h"
60 #include "llvm/Support/MachineValueType.h"
61 #include "llvm/Support/raw_ostream.h"
62 #include <algorithm>
63 #include <cassert>
64 #include <cstdint>
65 #include <iterator>
66 #include <limits>
67 #include <memory>
68 #include <string>
69 #include <tuple>
70 #include <utility>
71 #include <vector>
72 
73 using namespace llvm;
74 
75 #define DEBUG_TYPE "machine-scheduler"
76 
77 STATISTIC(NumClustered, "Number of load/store pairs clustered");
78 
79 namespace llvm {
80 
81 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
82                            cl::desc("Force top-down list scheduling"));
83 cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
84                             cl::desc("Force bottom-up list scheduling"));
85 cl::opt<bool>
86 DumpCriticalPathLength("misched-dcpl", cl::Hidden,
87                        cl::desc("Print critical path length to stdout"));
88 
89 cl::opt<bool> VerifyScheduling(
90     "verify-misched", cl::Hidden,
91     cl::desc("Verify machine instrs before and after machine scheduling"));
92 
93 #ifndef NDEBUG
94 cl::opt<bool> ViewMISchedDAGs(
95     "view-misched-dags", cl::Hidden,
96     cl::desc("Pop up a window to show MISched dags after they are processed"));
97 #else
98 const bool ViewMISchedDAGs = false;
99 #endif // NDEBUG
100 
101 } // end namespace llvm
102 
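// As a usage sketch, these options can be handed to llc when experimenting
// with or debugging the scheduler (the input file name below is just a
// placeholder):
//   llc -verify-misched -misched-dcpl -misched-topdown foo.ll
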
103 #ifndef NDEBUG
104 /// In some situations a few uninteresting nodes depend on nearly all other
105 /// nodes in the graph; provide a cutoff to hide them.
106 static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden,
107   cl::desc("Hide nodes with more predecessors/successors than the cutoff"));
108 
109 static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
110   cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
111 
112 static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden,
113   cl::desc("Only schedule this function"));
114 static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,
115                                         cl::desc("Only schedule this MBB#"));
116 static cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
117                               cl::desc("Print schedule DAGs"));
118 #else
119 static const bool PrintDAGs = false;
120 #endif // NDEBUG
121 
122 /// Avoid quadratic complexity in unusually large basic blocks by limiting the
123 /// size of the ready lists.
124 static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden,
125   cl::desc("Limit ready list to N instructions"), cl::init(256));
126 
127 static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
128   cl::desc("Enable register pressure scheduling."), cl::init(true));
129 
130 static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
131   cl::desc("Enable cyclic critical path analysis."), cl::init(true));
132 
133 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
134                                         cl::desc("Enable memop clustering."),
135                                         cl::init(true));
136 static cl::opt<bool>
137     ForceFastCluster("force-fast-cluster", cl::Hidden,
138                      cl::desc("Switch to the fast cluster algorithm, at the "
139                               "loss of some fusion opportunities"),
140                      cl::init(false));
141 static cl::opt<unsigned>
142     FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
143                          cl::desc("The threshold for fast cluster"),
144                          cl::init(1000));
145 
146 // DAG subtrees must have at least this many nodes.
147 static const unsigned MinSubtreeSize = 8;
148 
149 // Pin the vtables to this file.
150 void MachineSchedStrategy::anchor() {}
151 
152 void ScheduleDAGMutation::anchor() {}
153 
154 //===----------------------------------------------------------------------===//
155 // Machine Instruction Scheduling Pass and Registry
156 //===----------------------------------------------------------------------===//
157 
158 MachineSchedContext::MachineSchedContext() {
159   RegClassInfo = new RegisterClassInfo();
160 }
161 
162 MachineSchedContext::~MachineSchedContext() {
163   delete RegClassInfo;
164 }
165 
166 namespace {
167 
168 /// Base class for a machine scheduler pass that can run at any point.
169 class MachineSchedulerBase : public MachineSchedContext,
170                              public MachineFunctionPass {
171 public:
172   MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {}
173 
174   void print(raw_ostream &O, const Module* = nullptr) const override;
175 
176 protected:
177   void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags);
178 };
179 
180 /// MachineScheduler runs after coalescing and before register allocation.
181 class MachineScheduler : public MachineSchedulerBase {
182 public:
183   MachineScheduler();
184 
185   void getAnalysisUsage(AnalysisUsage &AU) const override;
186 
187   bool runOnMachineFunction(MachineFunction&) override;
188 
189   static char ID; // Class identification, replacement for typeinfo
190 
191 protected:
192   ScheduleDAGInstrs *createMachineScheduler();
193 };
194 
195 /// PostMachineScheduler runs shortly before code emission.
196 class PostMachineScheduler : public MachineSchedulerBase {
197 public:
198   PostMachineScheduler();
199 
200   void getAnalysisUsage(AnalysisUsage &AU) const override;
201 
202   bool runOnMachineFunction(MachineFunction&) override;
203 
204   static char ID; // Class identification, replacement for typeinfo
205 
206 protected:
207   ScheduleDAGInstrs *createPostMachineScheduler();
208 };
209 
210 } // end anonymous namespace
211 
212 char MachineScheduler::ID = 0;
213 
214 char &llvm::MachineSchedulerID = MachineScheduler::ID;
215 
216 INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE,
217                       "Machine Instruction Scheduler", false, false)
218 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
219 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
220 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
221 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
222 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
223 INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE,
224                     "Machine Instruction Scheduler", false, false)
225 
226 MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) {
227   initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
228 }
229 
230 void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
231   AU.setPreservesCFG();
232   AU.addRequired<MachineDominatorTree>();
233   AU.addRequired<MachineLoopInfo>();
234   AU.addRequired<AAResultsWrapperPass>();
235   AU.addRequired<TargetPassConfig>();
236   AU.addRequired<SlotIndexes>();
237   AU.addPreserved<SlotIndexes>();
238   AU.addRequired<LiveIntervals>();
239   AU.addPreserved<LiveIntervals>();
240   MachineFunctionPass::getAnalysisUsage(AU);
241 }
242 
243 char PostMachineScheduler::ID = 0;
244 
245 char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID;
246 
247 INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched",
248                       "PostRA Machine Instruction Scheduler", false, false)
249 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
250 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
251 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
252 INITIALIZE_PASS_END(PostMachineScheduler, "postmisched",
253                     "PostRA Machine Instruction Scheduler", false, false)
254 
255 PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) {
256   initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry());
257 }
258 
259 void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
260   AU.setPreservesCFG();
261   AU.addRequired<MachineDominatorTree>();
262   AU.addRequired<MachineLoopInfo>();
263   AU.addRequired<AAResultsWrapperPass>();
264   AU.addRequired<TargetPassConfig>();
265   MachineFunctionPass::getAnalysisUsage(AU);
266 }
267 
268 MachinePassRegistry<MachineSchedRegistry::ScheduleDAGCtor>
269     MachineSchedRegistry::Registry;
270 
271 /// A dummy default scheduler factory indicates whether the scheduler
272 /// is overridden on the command line.
273 static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
274   return nullptr;
275 }
276 
277 /// MachineSchedOpt allows command line selection of the scheduler.
278 static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
279                RegisterPassParser<MachineSchedRegistry>>
280 MachineSchedOpt("misched",
281                 cl::init(&useDefaultMachineSched), cl::Hidden,
282                 cl::desc("Machine instruction scheduler to use"));
283 
284 static MachineSchedRegistry
285 DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
286                      useDefaultMachineSched);
287 
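// A minimal registration sketch (all names here are hypothetical): a custom
// strategy becomes selectable as -misched=my-sched by pairing a factory with a
// MachineSchedRegistry entry, for example:
//
//   static ScheduleDAGInstrs *createMySched(MachineSchedContext *C) {
//     return new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
//   }
//   static MachineSchedRegistry
//       MySchedRegistry("my-sched", "Example strategy.", createMySched);
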
288 static cl::opt<bool> EnableMachineSched(
289     "enable-misched",
290     cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
291     cl::Hidden);
292 
293 static cl::opt<bool> EnablePostRAMachineSched(
294     "enable-post-misched",
295     cl::desc("Enable the post-ra machine instruction scheduling pass."),
296     cl::init(true), cl::Hidden);
297 
298 /// Decrement this iterator until reaching the top or a non-debug instr.
299 static MachineBasicBlock::const_iterator
300 priorNonDebug(MachineBasicBlock::const_iterator I,
301               MachineBasicBlock::const_iterator Beg) {
302   assert(I != Beg && "reached the top of the region, cannot decrement");
303   while (--I != Beg) {
304     if (!I->isDebugOrPseudoInstr())
305       break;
306   }
307   return I;
308 }
309 
310 /// Non-const version.
311 static MachineBasicBlock::iterator
312 priorNonDebug(MachineBasicBlock::iterator I,
313               MachineBasicBlock::const_iterator Beg) {
314   return priorNonDebug(MachineBasicBlock::const_iterator(I), Beg)
315       .getNonConstIterator();
316 }
317 
318 /// If this iterator is a debug value, increment until reaching the End or a
319 /// non-debug instruction.
320 static MachineBasicBlock::const_iterator
321 nextIfDebug(MachineBasicBlock::const_iterator I,
322             MachineBasicBlock::const_iterator End) {
323   for (; I != End; ++I) {
324     if (!I->isDebugOrPseudoInstr())
325       break;
326   }
327   return I;
328 }
329 
330 /// Non-const version.
331 static MachineBasicBlock::iterator
332 nextIfDebug(MachineBasicBlock::iterator I,
333             MachineBasicBlock::const_iterator End) {
334   return nextIfDebug(MachineBasicBlock::const_iterator(I), End)
335       .getNonConstIterator();
336 }
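
// For example (as used by initQueues() and schedule() below), these helpers
// keep the scheduling zone boundaries on real instructions:
//   CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
//   MachineBasicBlock::iterator PriorII = priorNonDebug(CurrentBottom, CurrentTop);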
337 
338 /// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
339 ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
340   // Select the scheduler, or set the default.
341   MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
342   if (Ctor != useDefaultMachineSched)
343     return Ctor(this);
344 
345   // Get the default scheduler set by the target for this function.
346   ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
347   if (Scheduler)
348     return Scheduler;
349 
350   // Default to GenericScheduler.
351   return createGenericSchedLive(this);
352 }
353 
354 /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
355 /// the caller. We don't have a command line option to override the postRA
356 /// scheduler. The Target must configure it.
357 ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
358   // Get the postRA scheduler set by the target for this function.
359   ScheduleDAGInstrs *Scheduler = PassConfig->createPostMachineScheduler(this);
360   if (Scheduler)
361     return Scheduler;
362 
363   // Default to GenericScheduler.
364   return createGenericSchedPostRA(this);
365 }
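
// Targets opt in by overriding TargetPassConfig::createPostMachineScheduler().
// A sketch of a typical override, mirroring createGenericSchedPostRA() (the
// class name MyTargetPassConfig is a placeholder):
//
//   ScheduleDAGInstrs *
//   MyTargetPassConfig::createPostMachineScheduler(MachineSchedContext *C) const {
//     return new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(C),
//                              /*RemoveKillFlags=*/true);
//   }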
366 
367 /// Top-level MachineScheduler pass driver.
368 ///
369 /// Visit blocks in function order. Divide each block into scheduling regions
370 /// and visit them bottom-up. Visiting regions bottom-up is not required, but is
371 /// consistent with the DAG builder, which traverses the interior of the
372 /// scheduling regions bottom-up.
373 ///
374 /// This design avoids exposing scheduling boundaries to the DAG builder,
375 /// simplifying the DAG builder's support for "special" target instructions.
376 /// At the same time the design allows target schedulers to operate across
377 /// scheduling boundaries, for example to bundle the boundary instructions
378 /// without reordering them. This creates complexity, because the target
379 /// scheduler must update the RegionBegin and RegionEnd positions cached by
380 /// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
381 /// design would be to split blocks at scheduling boundaries, but LLVM has a
382 /// general bias against block splitting purely for implementation simplicity.
383 bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
384   if (skipFunction(mf.getFunction()))
385     return false;
386 
387   if (EnableMachineSched.getNumOccurrences()) {
388     if (!EnableMachineSched)
389       return false;
390   } else if (!mf.getSubtarget().enableMachineScheduler())
391     return false;
392 
393   LLVM_DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
394 
395   // Initialize the context of the pass.
396   MF = &mf;
397   MLI = &getAnalysis<MachineLoopInfo>();
398   MDT = &getAnalysis<MachineDominatorTree>();
399   PassConfig = &getAnalysis<TargetPassConfig>();
400   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
401 
402   LIS = &getAnalysis<LiveIntervals>();
403 
404   if (VerifyScheduling) {
405     LLVM_DEBUG(LIS->dump());
406     MF->verify(this, "Before machine scheduling.");
407   }
408   RegClassInfo->runOnMachineFunction(*MF);
409 
410   // Instantiate the selected scheduler for this target, function, and
411   // optimization level.
412   std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
413   scheduleRegions(*Scheduler, false);
414 
415   LLVM_DEBUG(LIS->dump());
416   if (VerifyScheduling)
417     MF->verify(this, "After machine scheduling.");
418   return true;
419 }
420 
421 bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
422   if (skipFunction(mf.getFunction()))
423     return false;
424 
425   if (EnablePostRAMachineSched.getNumOccurrences()) {
426     if (!EnablePostRAMachineSched)
427       return false;
428   } else if (!mf.getSubtarget().enablePostRAMachineScheduler()) {
429     LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
430     return false;
431   }
432   LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
433 
434   // Initialize the context of the pass.
435   MF = &mf;
436   MLI = &getAnalysis<MachineLoopInfo>();
437   PassConfig = &getAnalysis<TargetPassConfig>();
438   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
439 
440   if (VerifyScheduling)
441     MF->verify(this, "Before post machine scheduling.");
442 
443   // Instantiate the selected scheduler for this target, function, and
444   // optimization level.
445   std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler());
446   scheduleRegions(*Scheduler, true);
447 
448   if (VerifyScheduling)
449     MF->verify(this, "After post machine scheduling.");
450   return true;
451 }
452 
453 /// Return true if the given instruction should not be included in a scheduling
454 /// region.
455 ///
456 /// MachineScheduler does not currently support scheduling across calls. To
457 /// handle calls, the DAG builder needs to be modified to create register
458 /// anti/output dependencies on the registers clobbered by the call's regmask
459 /// operand. In PreRA scheduling, the stack pointer adjustment already prevents
460 /// scheduling across calls. In PostRA scheduling, we need the isCall check to
461 /// enforce the boundary, but there would be no benefit to postRA scheduling
462 /// across calls this late anyway.
463 static bool isSchedBoundary(MachineBasicBlock::iterator MI,
464                             MachineBasicBlock *MBB,
465                             MachineFunction *MF,
466                             const TargetInstrInfo *TII) {
467   return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF);
468 }
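
// In practice a boundary is either a call (checked above) or whatever the
// target's TargetInstrInfo::isSchedulingBoundary() reports; the generic
// implementation treats terminators and position-sensitive instructions such
// as labels as boundaries, and targets may add their own cases.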
469 
470 /// A region of an MBB for scheduling.
471 namespace {
472 struct SchedRegion {
473   /// RegionBegin is the first instruction in the scheduling region, and
474   /// RegionEnd is either MBB->end() or the scheduling boundary after the
475   /// last instruction in the scheduling region. These iterators cannot refer
476   /// to instructions outside of the identified scheduling region because
477   /// those may be reordered before scheduling this region.
478   MachineBasicBlock::iterator RegionBegin;
479   MachineBasicBlock::iterator RegionEnd;
480   unsigned NumRegionInstrs;
481 
482   SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E,
483               unsigned N) :
484     RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {}
485 };
486 } // end anonymous namespace
487 
488 using MBBRegionsVector = SmallVector<SchedRegion, 16>;
489 
490 static void
491 getSchedRegions(MachineBasicBlock *MBB,
492                 MBBRegionsVector &Regions,
493                 bool RegionsTopDown) {
494   MachineFunction *MF = MBB->getParent();
495   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
496 
497   MachineBasicBlock::iterator I = nullptr;
498   for (MachineBasicBlock::iterator RegionEnd = MBB->end();
499        RegionEnd != MBB->begin(); RegionEnd = I) {
500 
501     // Avoid decrementing RegionEnd for blocks with no terminator.
502     if (RegionEnd != MBB->end() ||
503         isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) {
504       --RegionEnd;
505     }
506 
507     // The next region starts above the previous region. Look backward in the
508     // instruction stream until we find the nearest boundary.
509     unsigned NumRegionInstrs = 0;
510     I = RegionEnd;
511     for (; I != MBB->begin(); --I) {
512       MachineInstr &MI = *std::prev(I);
513       if (isSchedBoundary(&MI, &*MBB, MF, TII))
514         break;
515       if (!MI.isDebugOrPseudoInstr()) {
516         // MBB::size() uses instr_iterator to count. Here we need a bundle to
517         // count as a single instruction.
518         ++NumRegionInstrs;
519       }
520     }
521 
522     // It's possible we found a scheduling region that only has debug
523     // instructions. Don't bother scheduling these.
524     if (NumRegionInstrs != 0)
525       Regions.push_back(SchedRegion(I, RegionEnd, NumRegionInstrs));
526   }
527 
528   if (RegionsTopDown)
529     std::reverse(Regions.begin(), Regions.end());
530 }
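
// As an illustrative sketch, a block of the form
//   %a = ...
//   %b = ...
//   call @fn        ; scheduling boundary (isCall)
//   %c = ...
// produces two regions, {%c} and {%a, %b}. They are recorded here bottom-up
// ({%c} first) and reversed if the scheduler asks for top-down region order.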
531 
532 /// Main driver for both MachineScheduler and PostMachineScheduler.
533 void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
534                                            bool FixKillFlags) {
535   // Visit all machine basic blocks.
536   //
537   // TODO: Visit blocks in global postorder or postorder within the bottom-up
538   // loop tree. Then we can optionally compute global RegPressure.
539   for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
540        MBB != MBBEnd; ++MBB) {
541 
542     Scheduler.startBlock(&*MBB);
543 
544 #ifndef NDEBUG
545     if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName())
546       continue;
547     if (SchedOnlyBlock.getNumOccurrences()
548         && (int)SchedOnlyBlock != MBB->getNumber())
549       continue;
550 #endif
551 
552     // Break the block into scheduling regions [I, RegionEnd). RegionEnd
553     // points to the scheduling boundary at the bottom of the region. The DAG
554     // does not include RegionEnd, but the region does (i.e. the next
555     // RegionEnd is above the previous RegionBegin). If the current block has
556     // no terminator then RegionEnd == MBB->end() for the bottom region.
557     //
558     // All the regions of MBB are first found and stored in MBBRegions, and
559     // are then processed top-down if doMBBSchedRegionsTopDown() returns true.
560     //
561     // The Scheduler may insert instructions during either schedule() or
562     // exitRegion(), even for empty regions. So the local iterators 'I' and
563     // 'RegionEnd' are invalid across these calls. Instructions must not be
564     // added to other regions than the current one without updating MBBRegions.
565 
566     MBBRegionsVector MBBRegions;
567     getSchedRegions(&*MBB, MBBRegions, Scheduler.doMBBSchedRegionsTopDown());
568     for (const SchedRegion &R : MBBRegions) {
569       MachineBasicBlock::iterator I = R.RegionBegin;
570       MachineBasicBlock::iterator RegionEnd = R.RegionEnd;
571       unsigned NumRegionInstrs = R.NumRegionInstrs;
572 
573       // Notify the scheduler of the region, even if we may skip scheduling
574       // it. Perhaps it still needs to be bundled.
575       Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
576 
577       // Skip empty scheduling regions (0 or 1 schedulable instructions).
578       if (I == RegionEnd || I == std::prev(RegionEnd)) {
579         // Close the current region. Bundle the terminator if needed.
580         // This invalidates 'RegionEnd' and 'I'.
581         Scheduler.exitRegion();
582         continue;
583       }
584       LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
585       LLVM_DEBUG(dbgs() << MF->getName() << ":" << printMBBReference(*MBB)
586                         << " " << MBB->getName() << "\n  From: " << *I
587                         << "    To: ";
588                  if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
589                  else dbgs() << "End\n";
590                  dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
591       if (DumpCriticalPathLength) {
592         errs() << MF->getName();
593         errs() << ":%bb. " << MBB->getNumber();
594         errs() << " " << MBB->getName() << " \n";
595       }
596 
597       // Schedule a region: possibly reorder instructions.
598       // This invalidates the original region iterators.
599       Scheduler.schedule();
600 
601       // Close the current region.
602       Scheduler.exitRegion();
603     }
604     Scheduler.finishBlock();
605     // FIXME: Ideally, no further passes should rely on kill flags. However,
606     // thumb2 size reduction is currently an exception, so the PostMIScheduler
607     // needs to do this.
608     if (FixKillFlags)
609       Scheduler.fixupKills(*MBB);
610   }
611   Scheduler.finalizeSchedule();
612 }
613 
614 void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
615   // unimplemented
616 }
617 
618 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
619 LLVM_DUMP_METHOD void ReadyQueue::dump() const {
620   dbgs() << "Queue " << Name << ": ";
621   for (const SUnit *SU : Queue)
622     dbgs() << SU->NodeNum << " ";
623   dbgs() << "\n";
624 }
625 #endif
626 
627 //===----------------------------------------------------------------------===//
628 // ScheduleDAGMI - Basic machine instruction scheduling. This is
629 // independent of PreRA/PostRA scheduling and involves no extra book-keeping for
630 // virtual registers.
631 //===----------------------------------------------------------------------===//
632 
633 // Provide a vtable anchor.
634 ScheduleDAGMI::~ScheduleDAGMI() = default;
635 
636 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
637 /// NumPredsLeft reaches zero, release the successor node.
638 ///
639 /// FIXME: Adjust SuccSU height based on MinLatency.
640 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
641   SUnit *SuccSU = SuccEdge->getSUnit();
642 
643   if (SuccEdge->isWeak()) {
644     --SuccSU->WeakPredsLeft;
645     if (SuccEdge->isCluster())
646       NextClusterSucc = SuccSU;
647     return;
648   }
649 #ifndef NDEBUG
650   if (SuccSU->NumPredsLeft == 0) {
651     dbgs() << "*** Scheduling failed! ***\n";
652     dumpNode(*SuccSU);
653     dbgs() << " has been released too many times!\n";
654     llvm_unreachable(nullptr);
655   }
656 #endif
657   // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However,
658   // CurrCycle may have advanced since then.
659   if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency())
660     SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency();
661 
662   --SuccSU->NumPredsLeft;
663   if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
664     SchedImpl->releaseTopNode(SuccSU);
665 }
666 
667 /// releaseSuccessors - Call releaseSucc on each of SU's successors.
668 void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
669   for (SDep &Succ : SU->Succs)
670     releaseSucc(SU, &Succ);
671 }
672 
673 /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
674 /// NumSuccsLeft reaches zero, release the predecessor node.
675 ///
676 /// FIXME: Adjust PredSU height based on MinLatency.
677 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
678   SUnit *PredSU = PredEdge->getSUnit();
679 
680   if (PredEdge->isWeak()) {
681     --PredSU->WeakSuccsLeft;
682     if (PredEdge->isCluster())
683       NextClusterPred = PredSU;
684     return;
685   }
686 #ifndef NDEBUG
687   if (PredSU->NumSuccsLeft == 0) {
688     dbgs() << "*** Scheduling failed! ***\n";
689     dumpNode(*PredSU);
690     dbgs() << " has been released too many times!\n";
691     llvm_unreachable(nullptr);
692   }
693 #endif
694   // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However,
695   // CurrCycle may have advanced since then.
696   if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency())
697     PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency();
698 
699   --PredSU->NumSuccsLeft;
700   if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
701     SchedImpl->releaseBottomNode(PredSU);
702 }
703 
704 /// releasePredecessors - Call releasePred on each of SU's predecessors.
705 void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
706   for (SDep &Pred : SU->Preds)
707     releasePred(SU, &Pred);
708 }
709 
710 void ScheduleDAGMI::startBlock(MachineBasicBlock *bb) {
711   ScheduleDAGInstrs::startBlock(bb);
712   SchedImpl->enterMBB(bb);
713 }
714 
715 void ScheduleDAGMI::finishBlock() {
716   SchedImpl->leaveMBB();
717   ScheduleDAGInstrs::finishBlock();
718 }
719 
720 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
721 /// crossing a scheduling boundary. [begin, end) includes all instructions in
722 /// the region, including the boundary itself and single-instruction regions
723 /// that don't get scheduled.
724 void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
725                                      MachineBasicBlock::iterator begin,
726                                      MachineBasicBlock::iterator end,
727                                      unsigned regioninstrs)
728 {
729   ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
730 
731   SchedImpl->initPolicy(begin, end, regioninstrs);
732 }
733 
734 /// This is normally called from the main scheduler loop but may also be invoked
735 /// by the scheduling strategy to perform additional code motion.
736 void ScheduleDAGMI::moveInstruction(
737   MachineInstr *MI, MachineBasicBlock::iterator InsertPos) {
738   // Advance RegionBegin if the first instruction moves down.
739   if (&*RegionBegin == MI)
740     ++RegionBegin;
741 
742   // Update the instruction stream.
743   BB->splice(InsertPos, BB, MI);
744 
745   // Update LiveIntervals
746   if (LIS)
747     LIS->handleMove(*MI, /*UpdateFlags=*/true);
748 
749   // Recede RegionBegin if an instruction moves above the first.
750   if (RegionBegin == InsertPos)
751     RegionBegin = MI;
752 }
753 
754 bool ScheduleDAGMI::checkSchedLimit() {
755 #ifndef NDEBUG
756   if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
757     CurrentTop = CurrentBottom;
758     return false;
759   }
760   ++NumInstrsScheduled;
761 #endif
762   return true;
763 }
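
// The -misched-cutoff=N option above (only in builds with assertions) drives
// this check: once N instructions have been scheduled, the rest are left in
// their current order, which makes it possible to bisect a bad scheduling
// decision. Usage sketch (the count and file name are placeholders):
//   llc -misched-cutoff=42 foo.ll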
764 
765 /// Per-region scheduling driver, called back from
766 /// MachineScheduler::runOnMachineFunction. This is a simplified driver that
767 /// does not consider liveness or register pressure. It is useful for PostRA
768 /// scheduling and potentially other custom schedulers.
769 void ScheduleDAGMI::schedule() {
770   LLVM_DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n");
771   LLVM_DEBUG(SchedImpl->dumpPolicy());
772 
773   // Build the DAG.
774   buildSchedGraph(AA);
775 
776   postprocessDAG();
777 
778   SmallVector<SUnit*, 8> TopRoots, BotRoots;
779   findRootsAndBiasEdges(TopRoots, BotRoots);
780 
781   LLVM_DEBUG(dump());
782   if (PrintDAGs) dump();
783   if (ViewMISchedDAGs) viewGraph();
784 
785   // Initialize the strategy before modifying the DAG.
786   // This may initialize a DFSResult to be used for queue priority.
787   SchedImpl->initialize(this);
788 
789   // Initialize ready queues now that the DAG and priority data are finalized.
790   initQueues(TopRoots, BotRoots);
791 
792   bool IsTopNode = false;
793   while (true) {
794     LLVM_DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n");
795     SUnit *SU = SchedImpl->pickNode(IsTopNode);
796     if (!SU) break;
797 
798     assert(!SU->isScheduled && "Node already scheduled");
799     if (!checkSchedLimit())
800       break;
801 
802     MachineInstr *MI = SU->getInstr();
803     if (IsTopNode) {
804       assert(SU->isTopReady() && "node still has unscheduled dependencies");
805       if (&*CurrentTop == MI)
806         CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
807       else
808         moveInstruction(MI, CurrentTop);
809     } else {
810       assert(SU->isBottomReady() && "node still has unscheduled dependencies");
811       MachineBasicBlock::iterator priorII =
812         priorNonDebug(CurrentBottom, CurrentTop);
813       if (&*priorII == MI)
814         CurrentBottom = priorII;
815       else {
816         if (&*CurrentTop == MI)
817           CurrentTop = nextIfDebug(++CurrentTop, priorII);
818         moveInstruction(MI, CurrentBottom);
819         CurrentBottom = MI;
820       }
821     }
822     // Notify the scheduling strategy before updating the DAG.
823     // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues
824     // runs, it can then use the accurate ReadyCycle time to determine whether
825     // newly released nodes can move to the readyQ.
826     SchedImpl->schedNode(SU, IsTopNode);
827 
828     updateQueues(SU, IsTopNode);
829   }
830   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
831 
832   placeDebugValues();
833 
834   LLVM_DEBUG({
835     dbgs() << "*** Final schedule for "
836            << printMBBReference(*begin()->getParent()) << " ***\n";
837     dumpSchedule();
838     dbgs() << '\n';
839   });
840 }
841 
842 /// Apply each ScheduleDAGMutation step in order.
843 void ScheduleDAGMI::postprocessDAG() {
844   for (auto &m : Mutations)
845     m->apply(this);
846 }
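
// Mutations are typically registered before scheduling by the target or by the
// default scheduler factories; a common pattern (see the memop clustering
// mutation later in this file) is:
//   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));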
847 
848 void ScheduleDAGMI::
849 findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
850                       SmallVectorImpl<SUnit*> &BotRoots) {
851   for (SUnit &SU : SUnits) {
852     assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits");
853 
854     // Order predecessors so DFSResult follows the critical path.
855     SU.biasCriticalPath();
856 
857     // A SUnit is ready to top schedule if it has no predecessors.
858     if (!SU.NumPredsLeft)
859       TopRoots.push_back(&SU);
860     // A SUnit is ready to bottom schedule if it has no successors.
861     if (!SU.NumSuccsLeft)
862       BotRoots.push_back(&SU);
863   }
864   ExitSU.biasCriticalPath();
865 }
866 
867 /// Identify DAG roots and setup scheduler queues.
868 /// Identify DAG roots and set up scheduler queues.
869                                ArrayRef<SUnit*> BotRoots) {
870   NextClusterSucc = nullptr;
871   NextClusterPred = nullptr;
872 
873   // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
874   //
875   // Nodes with unreleased weak edges can still be roots.
876   // Release top roots in forward order.
877   for (SUnit *SU : TopRoots)
878     SchedImpl->releaseTopNode(SU);
879 
880   // Release bottom roots in reverse order so the higher priority nodes appear
881   // first. This is more natural and slightly more efficient.
882   for (SmallVectorImpl<SUnit*>::const_reverse_iterator
883          I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
884     SchedImpl->releaseBottomNode(*I);
885   }
886 
887   releaseSuccessors(&EntrySU);
888   releasePredecessors(&ExitSU);
889 
890   SchedImpl->registerRoots();
891 
892   // Advance past initial DebugValues.
893   CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
894   CurrentBottom = RegionEnd;
895 }
896 
897 /// Update scheduler queues after scheduling an instruction.
898 void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
899   // Release dependent instructions for scheduling.
900   if (IsTopNode)
901     releaseSuccessors(SU);
902   else
903     releasePredecessors(SU);
904 
905   SU->isScheduled = true;
906 }
907 
908 /// Reinsert any remaining debug_values, just like the PostRA scheduler.
909 void ScheduleDAGMI::placeDebugValues() {
910   // If first instruction was a DBG_VALUE then put it back.
911   if (FirstDbgValue) {
912     BB->splice(RegionBegin, BB, FirstDbgValue);
913     RegionBegin = FirstDbgValue;
914   }
915 
916   for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator
917          DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
918     std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
919     MachineInstr *DbgValue = P.first;
920     MachineBasicBlock::iterator OrigPrevMI = P.second;
921     if (&*RegionBegin == DbgValue)
922       ++RegionBegin;
923     BB->splice(std::next(OrigPrevMI), BB, DbgValue);
924     if (RegionEnd != BB->end() && OrigPrevMI == &*RegionEnd)
925       RegionEnd = DbgValue;
926   }
927 }
928 
929 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
930 LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
931   for (MachineInstr &MI : *this) {
932     if (SUnit *SU = getSUnit(&MI))
933       dumpNode(*SU);
934     else
935       dbgs() << "Missing SUnit\n";
936   }
937 }
938 #endif
939 
940 //===----------------------------------------------------------------------===//
941 // ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals
942 // preservation.
943 //===----------------------------------------------------------------------===//
944 
945 ScheduleDAGMILive::~ScheduleDAGMILive() {
946   delete DFSResult;
947 }
948 
949 void ScheduleDAGMILive::collectVRegUses(SUnit &SU) {
950   const MachineInstr &MI = *SU.getInstr();
951   for (const MachineOperand &MO : MI.operands()) {
952     if (!MO.isReg())
953       continue;
954     if (!MO.readsReg())
955       continue;
956     if (TrackLaneMasks && !MO.isUse())
957       continue;
958 
959     Register Reg = MO.getReg();
960     if (!Register::isVirtualRegister(Reg))
961       continue;
962 
963     // Ignore re-defs.
964     if (TrackLaneMasks) {
965       bool FoundDef = false;
966       for (const MachineOperand &MO2 : MI.operands()) {
967         if (MO2.isReg() && MO2.isDef() && MO2.getReg() == Reg && !MO2.isDead()) {
968           FoundDef = true;
969           break;
970         }
971       }
972       if (FoundDef)
973         continue;
974     }
975 
976     // Record this local VReg use.
977     VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
978     for (; UI != VRegUses.end(); ++UI) {
979       if (UI->SU == &SU)
980         break;
981     }
982     if (UI == VRegUses.end())
983       VRegUses.insert(VReg2SUnit(Reg, LaneBitmask::getNone(), &SU));
984   }
985 }
986 
987 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
988 /// crossing a scheduling boundary. [begin, end) includes all instructions in
989 /// the region, including the boundary itself and single-instruction regions
990 /// that don't get scheduled.
991 void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
992                                 MachineBasicBlock::iterator begin,
993                                 MachineBasicBlock::iterator end,
994                                 unsigned regioninstrs)
995 {
996   // ScheduleDAGMI initializes SchedImpl's per-region policy.
997   ScheduleDAGMI::enterRegion(bb, begin, end, regioninstrs);
998 
999   // For convenience remember the end of the liveness region.
1000   LiveRegionEnd = (RegionEnd == bb->end()) ? RegionEnd : std::next(RegionEnd);
1001 
1002   SUPressureDiffs.clear();
1003 
1004   ShouldTrackPressure = SchedImpl->shouldTrackPressure();
1005   ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks();
1006 
1007   assert((!ShouldTrackLaneMasks || ShouldTrackPressure) &&
1008          "ShouldTrackLaneMasks requires ShouldTrackPressure");
1009 }
1010 
1011 // Set up the register pressure trackers for the top-scheduled and
1012 // bottom-scheduled regions.
1013 void ScheduleDAGMILive::initRegPressure() {
1014   VRegUses.clear();
1015   VRegUses.setUniverse(MRI.getNumVirtRegs());
1016   for (SUnit &SU : SUnits)
1017     collectVRegUses(SU);
1018 
1019   TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin,
1020                     ShouldTrackLaneMasks, false);
1021   BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
1022                     ShouldTrackLaneMasks, false);
1023 
1024   // Close the RPTracker to finalize live ins.
1025   RPTracker.closeRegion();
1026 
1027   LLVM_DEBUG(RPTracker.dump());
1028 
1029   // Initialize the live ins and live outs.
1030   TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
1031   BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
1032 
1033   // Close one end of the tracker so we can call
1034   // getMaxUpward/DownwardPressureDelta before advancing across any
1035   // instructions. This converts currently live regs into live ins/outs.
1036   TopRPTracker.closeTop();
1037   BotRPTracker.closeBottom();
1038 
1039   BotRPTracker.initLiveThru(RPTracker);
1040   if (!BotRPTracker.getLiveThru().empty()) {
1041     TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
1042     LLVM_DEBUG(dbgs() << "Live Thru: ";
1043                dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
1044   }
1045 
1046   // For each live out vreg reduce the pressure change associated with other
1047   // uses of the same vreg below the live-out reaching def.
1048   updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
1049 
1050   // Account for liveness generated by the region boundary.
1051   if (LiveRegionEnd != RegionEnd) {
1052     SmallVector<RegisterMaskPair, 8> LiveUses;
1053     BotRPTracker.recede(&LiveUses);
1054     updatePressureDiffs(LiveUses);
1055   }
1056 
1057   LLVM_DEBUG(dbgs() << "Top Pressure:\n";
1058              dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
1059              dbgs() << "Bottom Pressure:\n";
1060              dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI););
1061 
1062   assert((BotRPTracker.getPos() == RegionEnd ||
1063           (RegionEnd->isDebugInstr() &&
1064            BotRPTracker.getPos() == priorNonDebug(RegionEnd, RegionBegin))) &&
1065          "Can't find the region bottom");
1066 
1067   // Cache the list of excess pressure sets in this region. This will also track
1068   // the max pressure in the scheduled code for these sets.
1069   RegionCriticalPSets.clear();
1070   const std::vector<unsigned> &RegionPressure =
1071     RPTracker.getPressure().MaxSetPressure;
1072   for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
1073     unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
1074     if (RegionPressure[i] > Limit) {
1075       LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(i) << " Limit " << Limit
1076                         << " Actual " << RegionPressure[i] << "\n");
1077       RegionCriticalPSets.push_back(PressureChange(i));
1078     }
1079   }
1080   LLVM_DEBUG(dbgs() << "Excess PSets: ";
1081              for (const PressureChange &RCPS
1082                   : RegionCriticalPSets) dbgs()
1083              << TRI->getRegPressureSetName(RCPS.getPSet()) << " ";
1084              dbgs() << "\n");
1085 }
1086 
1087 void ScheduleDAGMILive::
1088 updateScheduledPressure(const SUnit *SU,
1089                         const std::vector<unsigned> &NewMaxPressure) {
1090   const PressureDiff &PDiff = getPressureDiff(SU);
1091   unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
1092   for (const PressureChange &PC : PDiff) {
1093     if (!PC.isValid())
1094       break;
1095     unsigned ID = PC.getPSet();
1096     while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
1097       ++CritIdx;
1098     if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
1099       if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
1100           && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max())
1101         RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
1102     }
1103     unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
1104     if (NewMaxPressure[ID] >= Limit - 2) {
1105       LLVM_DEBUG(dbgs() << "  " << TRI->getRegPressureSetName(ID) << ": "
1106                         << NewMaxPressure[ID]
1107                         << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ")
1108                         << Limit << "(+ " << BotRPTracker.getLiveThru()[ID]
1109                         << " livethru)\n");
1110     }
1111   }
1112 }
1113 
1114 /// Update the PressureDiff array for liveness after scheduling this
1115 /// instruction.
1116 void ScheduleDAGMILive::updatePressureDiffs(
1117     ArrayRef<RegisterMaskPair> LiveUses) {
1118   for (const RegisterMaskPair &P : LiveUses) {
1119     Register Reg = P.RegUnit;
1120     /// FIXME: Currently assuming single-use physregs.
1121     if (!Register::isVirtualRegister(Reg))
1122       continue;
1123 
1124     if (ShouldTrackLaneMasks) {
1125       // If the register has just become live then other uses won't change
1126       // this fact anymore => decrement pressure.
1127       // If the register has just become dead then other uses make it come
1128       // back to life => increment pressure.
1129       bool Decrement = P.LaneMask.any();
1130 
1131       for (const VReg2SUnit &V2SU
1132            : make_range(VRegUses.find(Reg), VRegUses.end())) {
1133         SUnit &SU = *V2SU.SU;
1134         if (SU.isScheduled || &SU == &ExitSU)
1135           continue;
1136 
1137         PressureDiff &PDiff = getPressureDiff(&SU);
1138         PDiff.addPressureChange(Reg, Decrement, &MRI);
1139         LLVM_DEBUG(dbgs() << "  UpdateRegP: SU(" << SU.NodeNum << ") "
1140                           << printReg(Reg, TRI) << ':'
1141                           << PrintLaneMask(P.LaneMask) << ' ' << *SU.getInstr();
1142                    dbgs() << "              to "; PDiff.dump(*TRI););
1143       }
1144     } else {
1145       assert(P.LaneMask.any());
1146       LLVM_DEBUG(dbgs() << "  LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n");
1147       // This may be called before CurrentBottom has been initialized. However,
1148       // BotRPTracker must have a valid position. We want the value live into the
1149       // instruction or live out of the block, so ask for the previous
1150       // instruction's live-out.
1151       const LiveInterval &LI = LIS->getInterval(Reg);
1152       VNInfo *VNI;
1153       MachineBasicBlock::const_iterator I =
1154         nextIfDebug(BotRPTracker.getPos(), BB->end());
1155       if (I == BB->end())
1156         VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
1157       else {
1158         LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I));
1159         VNI = LRQ.valueIn();
1160       }
1161       // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
1162       assert(VNI && "No live value at use.");
1163       for (const VReg2SUnit &V2SU
1164            : make_range(VRegUses.find(Reg), VRegUses.end())) {
1165         SUnit *SU = V2SU.SU;
1166         // If this use comes before the reaching def, it cannot be a last use,
1167         // so decrease its pressure change.
1168         if (!SU->isScheduled && SU != &ExitSU) {
1169           LiveQueryResult LRQ =
1170               LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
1171           if (LRQ.valueIn() == VNI) {
1172             PressureDiff &PDiff = getPressureDiff(SU);
1173             PDiff.addPressureChange(Reg, true, &MRI);
1174             LLVM_DEBUG(dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") "
1175                               << *SU->getInstr();
1176                        dbgs() << "              to "; PDiff.dump(*TRI););
1177           }
1178         }
1179       }
1180     }
1181   }
1182 }
1183 
1184 void ScheduleDAGMILive::dump() const {
1185 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1186   if (EntrySU.getInstr() != nullptr)
1187     dumpNodeAll(EntrySU);
1188   for (const SUnit &SU : SUnits) {
1189     dumpNodeAll(SU);
1190     if (ShouldTrackPressure) {
1191       dbgs() << "  Pressure Diff      : ";
1192       getPressureDiff(&SU).dump(*TRI);
1193     }
1194     dbgs() << "  Single Issue       : ";
1195     if (SchedModel.mustBeginGroup(SU.getInstr()) &&
1196         SchedModel.mustEndGroup(SU.getInstr()))
1197       dbgs() << "true;";
1198     else
1199       dbgs() << "false;";
1200     dbgs() << '\n';
1201   }
1202   if (ExitSU.getInstr() != nullptr)
1203     dumpNodeAll(ExitSU);
1204 #endif
1205 }
1206 
1207 /// schedule - Called back from MachineScheduler::runOnMachineFunction
1208 /// after setting up the current scheduling region. [RegionBegin, RegionEnd)
1209 /// only includes instructions that have DAG nodes, not scheduling boundaries.
1210 ///
1211 /// This is a skeletal driver, with all the functionality pushed into helpers,
1212 /// so that it can be easily extended by experimental schedulers. Generally,
1213 /// implementing MachineSchedStrategy should be sufficient to implement a new
1214 /// scheduling algorithm. However, if a scheduler further subclasses
1215 /// ScheduleDAGMILive then it will want to override this virtual method in order
1216 /// to update any specialized state.
1217 void ScheduleDAGMILive::schedule() {
1218   LLVM_DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n");
1219   LLVM_DEBUG(SchedImpl->dumpPolicy());
1220   buildDAGWithRegPressure();
1221 
1222   postprocessDAG();
1223 
1224   SmallVector<SUnit*, 8> TopRoots, BotRoots;
1225   findRootsAndBiasEdges(TopRoots, BotRoots);
1226 
1227   // Initialize the strategy before modifying the DAG.
1228   // This may initialize a DFSResult to be used for queue priority.
1229   SchedImpl->initialize(this);
1230 
1231   LLVM_DEBUG(dump());
1232   if (PrintDAGs) dump();
1233   if (ViewMISchedDAGs) viewGraph();
1234 
1235   // Initialize ready queues now that the DAG and priority data are finalized.
1236   initQueues(TopRoots, BotRoots);
1237 
1238   bool IsTopNode = false;
1239   while (true) {
1240     LLVM_DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
1241     SUnit *SU = SchedImpl->pickNode(IsTopNode);
1242     if (!SU) break;
1243 
1244     assert(!SU->isScheduled && "Node already scheduled");
1245     if (!checkSchedLimit())
1246       break;
1247 
1248     scheduleMI(SU, IsTopNode);
1249 
1250     if (DFSResult) {
1251       unsigned SubtreeID = DFSResult->getSubtreeID(SU);
1252       if (!ScheduledTrees.test(SubtreeID)) {
1253         ScheduledTrees.set(SubtreeID);
1254         DFSResult->scheduleTree(SubtreeID);
1255         SchedImpl->scheduleTree(SubtreeID);
1256       }
1257     }
1258 
1259     // Notify the scheduling strategy after updating the DAG.
1260     SchedImpl->schedNode(SU, IsTopNode);
1261 
1262     updateQueues(SU, IsTopNode);
1263   }
1264   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
1265 
1266   placeDebugValues();
1267 
1268   LLVM_DEBUG({
1269     dbgs() << "*** Final schedule for "
1270            << printMBBReference(*begin()->getParent()) << " ***\n";
1271     dumpSchedule();
1272     dbgs() << '\n';
1273   });
1274 }
1275 
1276 /// Build the DAG and set up three register pressure trackers.
1277 void ScheduleDAGMILive::buildDAGWithRegPressure() {
1278   if (!ShouldTrackPressure) {
1279     RPTracker.reset();
1280     RegionCriticalPSets.clear();
1281     buildSchedGraph(AA);
1282     return;
1283   }
1284 
1285   // Initialize the register pressure tracker used by buildSchedGraph.
1286   RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
1287                  ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true);
1288 
1289   // Account for liveness generated by the region boundary.
1290   if (LiveRegionEnd != RegionEnd)
1291     RPTracker.recede();
1292 
1293   // Build the DAG, and compute current register pressure.
1294   buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks);
1295 
1296   // Initialize top/bottom trackers after computing region pressure.
1297   initRegPressure();
1298 }
1299 
1300 void ScheduleDAGMILive::computeDFSResult() {
1301   if (!DFSResult)
1302     DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
1303   DFSResult->clear();
1304   ScheduledTrees.clear();
1305   DFSResult->resize(SUnits.size());
1306   DFSResult->compute(SUnits);
1307   ScheduledTrees.resize(DFSResult->getNumSubtrees());
1308 }
1309 
1310 /// Compute the max cyclic critical path through the DAG. The scheduling DAG
1311 /// only provides the critical path for single block loops. To handle loops that
1312 /// span blocks, we could use the vreg path latencies provided by
1313 /// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
1314 /// available for use in the scheduler.
1315 ///
1316 /// The cyclic path estimation identifies a def-use pair that crosses the back
1317 /// edge and considers the depth and height of the nodes. For example, consider
1318 /// the following instruction sequence where each instruction has unit latency
1319 /// and defines an eponymous virtual register:
1320 ///
1321 /// a->b(a,c)->c(b)->d(c)->exit
1322 ///
1323 /// The cyclic critical path is two cycles: b->c->b
1324 /// The acyclic critical path is four cycles: a->b->c->d->exit
1325 /// LiveOutHeight = height(c) = len(c->d->exit) = 2
1326 /// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
1327 /// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
1328 /// LiveInDepth = depth(b) = len(a->b) = 1
1329 ///
1330 /// LiveOutDepth - LiveInDepth = 3 - 1 = 2
1331 /// LiveInHeight - LiveOutHeight = 4 - 2 = 2
1332 /// CyclicCriticalPath = min(2, 2) = 2
1333 ///
1334 /// This could be relevant to PostRA scheduling, but is currently implemented
1335 /// assuming LiveIntervals.
1336 unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
1337   // This only applies to single-block loops.
1338   if (!BB->isSuccessor(BB))
1339     return 0;
1340 
1341   unsigned MaxCyclicLatency = 0;
1342   // Visit each live out vreg def to find def/use pairs that cross iterations.
1343   for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
1344     Register Reg = P.RegUnit;
1345     if (!Register::isVirtualRegister(Reg))
1346       continue;
1347     const LiveInterval &LI = LIS->getInterval(Reg);
1348     const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
1349     if (!DefVNI)
1350       continue;
1351 
1352     MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
1353     const SUnit *DefSU = getSUnit(DefMI);
1354     if (!DefSU)
1355       continue;
1356 
1357     unsigned LiveOutHeight = DefSU->getHeight();
1358     unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
1359     // Visit all local users of the vreg def.
1360     for (const VReg2SUnit &V2SU
1361          : make_range(VRegUses.find(Reg), VRegUses.end())) {
1362       SUnit *SU = V2SU.SU;
1363       if (SU == &ExitSU)
1364         continue;
1365 
1366       // Only consider uses of the phi.
1367       LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
1368       if (!LRQ.valueIn()->isPHIDef())
1369         continue;
1370 
1371       // Assume that a path spanning two iterations is a cycle, which could
1372       // overestimate in strange cases. This allows cyclic latency to be
1373       // estimated as the minimum slack of the vreg's depth or height.
1374       unsigned CyclicLatency = 0;
1375       if (LiveOutDepth > SU->getDepth())
1376         CyclicLatency = LiveOutDepth - SU->getDepth();
1377 
1378       unsigned LiveInHeight = SU->getHeight() + DefSU->Latency;
1379       if (LiveInHeight > LiveOutHeight) {
1380         if (LiveInHeight - LiveOutHeight < CyclicLatency)
1381           CyclicLatency = LiveInHeight - LiveOutHeight;
1382       } else
1383         CyclicLatency = 0;
1384 
1385       LLVM_DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
1386                         << SU->NodeNum << ") = " << CyclicLatency << "c\n");
1387       if (CyclicLatency > MaxCyclicLatency)
1388         MaxCyclicLatency = CyclicLatency;
1389     }
1390   }
1391   LLVM_DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
1392   return MaxCyclicLatency;
1393 }
1394 
1395 /// Release ExitSU predecessors and set up scheduler queues. Re-position
1396 /// the Top RP tracker in case the region beginning has changed.
1397 void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots,
1398                                    ArrayRef<SUnit*> BotRoots) {
1399   ScheduleDAGMI::initQueues(TopRoots, BotRoots);
1400   if (ShouldTrackPressure) {
1401     assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
1402     TopRPTracker.setPos(CurrentTop);
1403   }
1404 }
1405 
1406 /// Move an instruction and update register pressure.
1407 void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
1408   // Move the instruction to its new location in the instruction stream.
1409   MachineInstr *MI = SU->getInstr();
1410 
1411   if (IsTopNode) {
1412     assert(SU->isTopReady() && "node still has unscheduled dependencies");
1413     if (&*CurrentTop == MI)
1414       CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
1415     else {
1416       moveInstruction(MI, CurrentTop);
1417       TopRPTracker.setPos(MI);
1418     }
1419 
1420     if (ShouldTrackPressure) {
1421       // Update top scheduled pressure.
1422       RegisterOperands RegOpers;
1423       RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
1424       if (ShouldTrackLaneMasks) {
1425         // Adjust liveness and add missing dead+read-undef flags.
1426         SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
1427         RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
1428       } else {
1429         // Adjust for missing dead-def flags.
1430         RegOpers.detectDeadDefs(*MI, *LIS);
1431       }
1432 
1433       TopRPTracker.advance(RegOpers);
1434       assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
1435       LLVM_DEBUG(dbgs() << "Top Pressure:\n"; dumpRegSetPressure(
1436                      TopRPTracker.getRegSetPressureAtPos(), TRI););
1437 
1438       updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
1439     }
1440   } else {
1441     assert(SU->isBottomReady() && "node still has unscheduled dependencies");
1442     MachineBasicBlock::iterator priorII =
1443       priorNonDebug(CurrentBottom, CurrentTop);
1444     if (&*priorII == MI)
1445       CurrentBottom = priorII;
1446     else {
1447       if (&*CurrentTop == MI) {
1448         CurrentTop = nextIfDebug(++CurrentTop, priorII);
1449         TopRPTracker.setPos(CurrentTop);
1450       }
1451       moveInstruction(MI, CurrentBottom);
1452       CurrentBottom = MI;
1453       BotRPTracker.setPos(CurrentBottom);
1454     }
1455     if (ShouldTrackPressure) {
1456       RegisterOperands RegOpers;
1457       RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
1458       if (ShouldTrackLaneMasks) {
1459         // Adjust liveness and add missing dead+read-undef flags.
1460         SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
1461         RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
1462       } else {
1463         // Adjust for missing dead-def flags.
1464         RegOpers.detectDeadDefs(*MI, *LIS);
1465       }
1466 
1467       if (BotRPTracker.getPos() != CurrentBottom)
1468         BotRPTracker.recedeSkipDebugValues();
1469       SmallVector<RegisterMaskPair, 8> LiveUses;
1470       BotRPTracker.recede(RegOpers, &LiveUses);
1471       assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
1472       LLVM_DEBUG(dbgs() << "Bottom Pressure:\n"; dumpRegSetPressure(
1473                      BotRPTracker.getRegSetPressureAtPos(), TRI););
1474 
1475       updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
1476       updatePressureDiffs(LiveUses);
1477     }
1478   }
1479 }
1480 
1481 //===----------------------------------------------------------------------===//
1482 // BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
1483 //===----------------------------------------------------------------------===//
1484 
1485 namespace {
1486 
1487 /// Post-process the DAG to create cluster edges between neighboring
1488 /// loads or between neighboring stores.
1489 class BaseMemOpClusterMutation : public ScheduleDAGMutation {
1490   struct MemOpInfo {
1491     SUnit *SU;
1492     SmallVector<const MachineOperand *, 4> BaseOps;
1493     int64_t Offset;
1494     unsigned Width;
1495 
1496     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
1497               int64_t Offset, unsigned Width)
1498         : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
1499           Width(Width) {}
1500 
1501     static bool Compare(const MachineOperand *const &A,
1502                         const MachineOperand *const &B) {
1503       if (A->getType() != B->getType())
1504         return A->getType() < B->getType();
1505       if (A->isReg())
1506         return A->getReg() < B->getReg();
1507       if (A->isFI()) {
1508         const MachineFunction &MF = *A->getParent()->getParent()->getParent();
1509         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
1510         bool StackGrowsDown = TFI.getStackGrowthDirection() ==
1511                               TargetFrameLowering::StackGrowsDown;
1512         return StackGrowsDown ? A->getIndex() > B->getIndex()
1513                               : A->getIndex() < B->getIndex();
1514       }
1515 
1516       llvm_unreachable("MemOpClusterMutation only supports register or frame "
1517                        "index bases.");
1518     }
1519 
1520     bool operator<(const MemOpInfo &RHS) const {
1521       // FIXME: Don't compare everything twice. Maybe use C++20 three way
1522       // comparison instead when it's available.
1523       if (std::lexicographical_compare(BaseOps.begin(), BaseOps.end(),
1524                                        RHS.BaseOps.begin(), RHS.BaseOps.end(),
1525                                        Compare))
1526         return true;
1527       if (std::lexicographical_compare(RHS.BaseOps.begin(), RHS.BaseOps.end(),
1528                                        BaseOps.begin(), BaseOps.end(), Compare))
1529         return false;
1530       if (Offset != RHS.Offset)
1531         return Offset < RHS.Offset;
1532       return SU->NodeNum < RHS.SU->NodeNum;
1533     }
1534   };
1535 
1536   const TargetInstrInfo *TII;
1537   const TargetRegisterInfo *TRI;
1538   bool IsLoad;
1539 
1540 public:
1541   BaseMemOpClusterMutation(const TargetInstrInfo *tii,
1542                            const TargetRegisterInfo *tri, bool IsLoad)
1543       : TII(tii), TRI(tri), IsLoad(IsLoad) {}
1544 
1545   void apply(ScheduleDAGInstrs *DAGInstrs) override;
1546 
1547 protected:
1548   void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster,
1549                                 ScheduleDAGInstrs *DAG);
1550   void collectMemOpRecords(std::vector<SUnit> &SUnits,
1551                            SmallVectorImpl<MemOpInfo> &MemOpRecords);
1552   bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
1553                    DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups);
1554 };
1555 
1556 class StoreClusterMutation : public BaseMemOpClusterMutation {
1557 public:
1558   StoreClusterMutation(const TargetInstrInfo *tii,
1559                        const TargetRegisterInfo *tri)
1560       : BaseMemOpClusterMutation(tii, tri, false) {}
1561 };
1562 
1563 class LoadClusterMutation : public BaseMemOpClusterMutation {
1564 public:
1565   LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
1566       : BaseMemOpClusterMutation(tii, tri, true) {}
1567 };
1568 
1569 } // end anonymous namespace
1570 
1571 namespace llvm {
1572 
1573 std::unique_ptr<ScheduleDAGMutation>
1574 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
1575                              const TargetRegisterInfo *TRI) {
1576   return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
1577                             : nullptr;
1578 }
1579 
1580 std::unique_ptr<ScheduleDAGMutation>
1581 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
1582                               const TargetRegisterInfo *TRI) {
1583   return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
1584                             : nullptr;
1585 }
1586 
1587 } // end namespace llvm
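
// These factories are what targets normally hook up. A typical use (a sketch,
// not something required by this file) is to register the mutation from a
// target's createMachineScheduler override, e.g.:
//   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
// When EnableMemOpCluster is off they return nullptr, and addMutation() is
// expected to simply ignore a null mutation.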
1588 
1589 // Sort all the loads/stores first. Then, for each load/store, check the
1590 // following loads/stores one by one until the first non-dependent one is
1591 // reached, and call the target hook to see whether they can be clustered.
1592 // If FastCluster is enabled, we assume that all the loads/stores have been
1593 // preprocessed and no longer have dependencies on each other.
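// For example, three loads off the same base register at offsets 0, 8 and 16
// can grow into a single cluster of length 3, provided no DAG dependency
// exists between them and the target's shouldClusterMemOps hook keeps
// accepting the growing length and byte count.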
1594 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
1595     ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
1596     ScheduleDAGInstrs *DAG) {
1597   // Keep track of the current cluster length and bytes for each SUnit.
1598   DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
1599 
1600   // At this point, the `MemOpRecords` array must hold at least two mem ops.
1601   // Try to cluster the mem ops collected within it.
1602   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
1603     // The decision to cluster mem ops is made based on target-dependent logic.
1604     auto MemOpa = MemOpRecords[Idx];
1605 
1606     // Look for the next load/store to cluster with.
1607     unsigned NextIdx = Idx + 1;
1608     for (; NextIdx < End; ++NextIdx)
1609       // Skip if MemOpb has already been clustered or has a dependency on
1610       // MemOpa.
1611       if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
1612           (FastCluster ||
1613            (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
1614             !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
1615         break;
1616     if (NextIdx == End)
1617       continue;
1618 
1619     auto MemOpb = MemOpRecords[NextIdx];
1620     unsigned ClusterLength = 2;
1621     unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
1622     if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
1623       ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
1624       CurrentClusterBytes =
1625           SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
1626     }
1627 
1628     if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
1629                                   CurrentClusterBytes))
1630       continue;
1631 
1632     SUnit *SUa = MemOpa.SU;
1633     SUnit *SUb = MemOpb.SU;
1634     if (SUa->NodeNum > SUb->NodeNum)
1635       std::swap(SUa, SUb);
1636 
1637     // FIXME: Is this check really required?
1638     if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
1639       continue;
1640 
1641     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
1642                       << SUb->NodeNum << ")\n");
1643     ++NumClustered;
1644 
1645     if (IsLoad) {
1646       // Copy successor edges from SUa to SUb. Interleaving computation
1647       // dependent on SUa can prevent load combining due to register reuse.
1648       // Predecessor edges do not need to be copied from SUb to SUa since
1649       // nearby loads should have effectively the same inputs.
1650       for (const SDep &Succ : SUa->Succs) {
1651         if (Succ.getSUnit() == SUb)
1652           continue;
1653         LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
1654                           << ")\n");
1655         DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
1656       }
1657     } else {
1658       // Copy predecessor edges from SUb to SUa so that the SUnits SUb
1659       // depends on cannot be scheduled in between SUb and SUa. Successor
1660       // edges do not need to be copied from SUa to SUb since nothing will
1661       // depend on stores.
1662       // Note that we don't need to worry about memory dependencies, because
1663       // we won't try to cluster the ops if they have any memory dependency.
1664       for (const SDep &Pred : SUb->Preds) {
1665         if (Pred.getSUnit() == SUa)
1666           continue;
1667         LLVM_DEBUG(dbgs() << "  Copy Pred SU(" << Pred.getSUnit()->NodeNum
1668                           << ")\n");
1669         DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
1670       }
1671     }
1672 
1673     SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
1674                                              CurrentClusterBytes};
1675 
1676     LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength
1677                       << ", Curr cluster bytes: " << CurrentClusterBytes
1678                       << "\n");
1679   }
1680 }
1681 
1682 void BaseMemOpClusterMutation::collectMemOpRecords(
1683     std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
1684   for (auto &SU : SUnits) {
1685     if ((IsLoad && !SU.getInstr()->mayLoad()) ||
1686         (!IsLoad && !SU.getInstr()->mayStore()))
1687       continue;
1688 
1689     const MachineInstr &MI = *SU.getInstr();
1690     SmallVector<const MachineOperand *, 4> BaseOps;
1691     int64_t Offset;
1692     bool OffsetIsScalable;
1693     unsigned Width;
1694     if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
1695                                            OffsetIsScalable, Width, TRI)) {
1696       MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
1697 
1698       LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
1699                         << Offset << ", OffsetIsScalable: " << OffsetIsScalable
1700                         << ", Width: " << Width << "\n");
1701     }
1702 #ifndef NDEBUG
1703     for (auto *Op : BaseOps)
1704       assert(Op);
1705 #endif
1706   }
1707 }
1708 
1709 bool BaseMemOpClusterMutation::groupMemOps(
1710     ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
1711     DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
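  // Fall back to the cheaper FastCluster mode when the product of mem ops and
  // DAG size suggests the per-pair reachability checks would be too expensive;
  // e.g. 100 mem ops in a 2000-node DAG gives 100 * 2000 / 1000 = 200, which is
  // compared against FastClusterThreshold.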
1712   bool FastCluster =
1713       ForceFastCluster ||
1714       MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
1715 
1716   for (const auto &MemOp : MemOps) {
1717     unsigned ChainPredID = DAG->SUnits.size();
1718     if (FastCluster) {
1719       for (const SDep &Pred : MemOp.SU->Preds) {
1720         // We only want to cluster the mem ops that have the same ctrl(non-data)
1721         // pred so that they didn't have ctrl dependency for each other. But for
1722         // store instrs, we can still cluster them if the pred is load instr.
1723         if ((Pred.isCtrl() &&
1724              (IsLoad ||
1725               (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
1726             !Pred.isArtificial()) {
1727           ChainPredID = Pred.getSUnit()->NodeNum;
1728           break;
1729         }
1730       }
1731     } else
1732       ChainPredID = 0;
1733 
1734     Groups[ChainPredID].push_back(MemOp);
1735   }
1736   return FastCluster;
1737 }
1738 
1739 /// Callback from DAG postProcessing to create cluster edges for loads/stores.
1740 void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
1741   // Collect all the clusterable loads/stores
1742   SmallVector<MemOpInfo, 32> MemOpRecords;
1743   collectMemOpRecords(DAG->SUnits, MemOpRecords);
1744 
1745   if (MemOpRecords.size() < 2)
1746     return;
1747 
1748   // If the DAG is too complex, use a heuristic to put loads/stores without
1749   // dependencies into the same group, to avoid a compile-time blow up.
1750   // Note that some fusion pairs may be lost with this.
1751   DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
1752   bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
1753 
1754   for (auto &Group : Groups) {
1755     // Sort the loads/stores so that clustering can stop as early as
1756     // possible.
1757     llvm::sort(Group.second);
1758 
1759     // Try to cluster all the neighboring loads/stores.
1760     clusterNeighboringMemOps(Group.second, FastCluster, DAG);
1761   }
1762 }
1763 
1764 //===----------------------------------------------------------------------===//
1765 // CopyConstrain - DAG post-processing to encourage copy elimination.
1766 //===----------------------------------------------------------------------===//
1767 
1768 namespace {
1769 
1770 /// Post-process the DAG to create weak edges from all uses of a copy to
1771 /// the one use that defines the copy's source vreg, most likely an induction
1772 /// variable increment.
1773 class CopyConstrain : public ScheduleDAGMutation {
1774   // Transient state.
1775   SlotIndex RegionBeginIdx;
1776 
1777   // RegionEndIdx is the slot index of the last non-debug instruction in the
1778   // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
1779   SlotIndex RegionEndIdx;
1780 
1781 public:
1782   CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
1783 
1784   void apply(ScheduleDAGInstrs *DAGInstrs) override;
1785 
1786 protected:
1787   void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
1788 };
1789 
1790 } // end anonymous namespace
1791 
1792 namespace llvm {
1793 
1794 std::unique_ptr<ScheduleDAGMutation>
1795 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
1796                                const TargetRegisterInfo *TRI) {
1797   return std::make_unique<CopyConstrain>(TII, TRI);
1798 }
1799 
1800 } // end namespace llvm
1801 
1802 /// constrainLocalCopy handles two possibilities:
1803 /// 1) Local src:
1804 /// I0:     = dst
1805 /// I1: src = ...
1806 /// I2:     = dst
1807 /// I3: dst = src (copy)
1808 /// (create pred->succ edges I0->I1, I2->I1)
1809 ///
1810 /// 2) Local copy:
1811 /// I0: dst = src (copy)
1812 /// I1:     = dst
1813 /// I2: src = ...
1814 /// I3:     = dst
1815 /// (create pred->succ edges I1->I2, I3->I2)
1816 ///
1817 /// Although the MachineScheduler is currently constrained to single blocks,
1818 /// this algorithm should handle extended blocks. An EBB is a set of
1819 /// contiguously numbered blocks such that the previous block in the EBB is
1820 /// always the single predecessor.
1821 void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
1822   LiveIntervals *LIS = DAG->getLIS();
1823   MachineInstr *Copy = CopySU->getInstr();
1824 
1825   // Check for pure vreg copies.
1826   const MachineOperand &SrcOp = Copy->getOperand(1);
1827   Register SrcReg = SrcOp.getReg();
1828   if (!Register::isVirtualRegister(SrcReg) || !SrcOp.readsReg())
1829     return;
1830 
1831   const MachineOperand &DstOp = Copy->getOperand(0);
1832   Register DstReg = DstOp.getReg();
1833   if (!Register::isVirtualRegister(DstReg) || DstOp.isDead())
1834     return;
1835 
1836   // Check if either the dest or source is local. If it's live across a back
1837   // edge, it's not local. Note that if both vregs are live across the back
1838   // edge, we cannot successfully constrain the copy without cyclic scheduling.
1839   // If both the copy's source and dest are local live intervals, then we
1840   // should treat the dest as the global for the purpose of adding
1841   // constraints. This adds edges from source's other uses to the copy.
1842   unsigned LocalReg = SrcReg;
1843   unsigned GlobalReg = DstReg;
1844   LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
1845   if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
1846     LocalReg = DstReg;
1847     GlobalReg = SrcReg;
1848     LocalLI = &LIS->getInterval(LocalReg);
1849     if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
1850       return;
1851   }
1852   LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
1853 
1854   // Find the global segment after the start of the local LI.
1855   LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
1856   // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
1857   // local live range. We could create edges from other global uses to the local
1858   // start, but the coalescer should have already eliminated these cases, so
1859   // don't bother dealing with them.
1860   if (GlobalSegment == GlobalLI->end())
1861     return;
1862 
1863   // If GlobalSegment is killed at the LocalLI->start, the call to find()
1864   // returned the next global segment. But if GlobalSegment overlaps with
1865   // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
1866   // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
1867   if (GlobalSegment->contains(LocalLI->beginIndex()))
1868     ++GlobalSegment;
1869 
1870   if (GlobalSegment == GlobalLI->end())
1871     return;
1872 
1873   // Check if GlobalLI contains a hole in the vicinity of LocalLI.
1874   if (GlobalSegment != GlobalLI->begin()) {
1875     // Two address defs have no hole.
1876     if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->end,
1877                                GlobalSegment->start)) {
1878       return;
1879     }
1880     // If the prior global segment may be defined by the same two-address
1881     // instruction that also defines LocalLI, then can't make a hole here.
1882     if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->start,
1883                                LocalLI->beginIndex())) {
1884       return;
1885     }
1886     // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
1887     // it would be a disconnected component in the live range.
1888     assert(std::prev(GlobalSegment)->start < LocalLI->beginIndex() &&
1889            "Disconnected LRG within the scheduling region.");
1890   }
1891   MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
1892   if (!GlobalDef)
1893     return;
1894 
1895   SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
1896   if (!GlobalSU)
1897     return;
1898 
1899   // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
1900   // constraining the uses of the last local def to precede GlobalDef.
1901   SmallVector<SUnit*,8> LocalUses;
1902   const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
1903   MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
1904   SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
1905   for (const SDep &Succ : LastLocalSU->Succs) {
1906     if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg)
1907       continue;
1908     if (Succ.getSUnit() == GlobalSU)
1909       continue;
1910     if (!DAG->canAddEdge(GlobalSU, Succ.getSUnit()))
1911       return;
1912     LocalUses.push_back(Succ.getSUnit());
1913   }
1914   // Open the top of the GlobalLI hole by constraining any earlier global uses
1915   // to precede the start of LocalLI.
1916   SmallVector<SUnit*,8> GlobalUses;
1917   MachineInstr *FirstLocalDef =
1918     LIS->getInstructionFromIndex(LocalLI->beginIndex());
1919   SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
1920   for (const SDep &Pred : GlobalSU->Preds) {
1921     if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg)
1922       continue;
1923     if (Pred.getSUnit() == FirstLocalSU)
1924       continue;
1925     if (!DAG->canAddEdge(FirstLocalSU, Pred.getSUnit()))
1926       return;
1927     GlobalUses.push_back(Pred.getSUnit());
1928   }
1929   LLVM_DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
1930   // Add the weak edges.
1931   for (SUnit *LU : LocalUses) {
1932     LLVM_DEBUG(dbgs() << "  Local use SU(" << LU->NodeNum << ") -> SU("
1933                       << GlobalSU->NodeNum << ")\n");
1934     DAG->addEdge(GlobalSU, SDep(LU, SDep::Weak));
1935   }
1936   for (SUnit *GU : GlobalUses) {
1937     LLVM_DEBUG(dbgs() << "  Global use SU(" << GU->NodeNum << ") -> SU("
1938                       << FirstLocalSU->NodeNum << ")\n");
1939     DAG->addEdge(FirstLocalSU, SDep(GU, SDep::Weak));
1940   }
1941 }
1942 
1943 /// Callback from DAG postProcessing to create weak edges to encourage
1944 /// copy elimination.
1945 void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) {
1946   ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1947   assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals");
1948 
1949   MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
1950   if (FirstPos == DAG->end())
1951     return;
1952   RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos);
1953   RegionEndIdx = DAG->getLIS()->getInstructionIndex(
1954       *priorNonDebug(DAG->end(), DAG->begin()));
1955 
1956   for (SUnit &SU : DAG->SUnits) {
1957     if (!SU.getInstr()->isCopy())
1958       continue;
1959 
1960     constrainLocalCopy(&SU, static_cast<ScheduleDAGMILive*>(DAG));
1961   }
1962 }
1963 
1964 //===----------------------------------------------------------------------===//
1965 // MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler
1966 // and possibly other custom schedulers.
1967 //===----------------------------------------------------------------------===//
1968 
1969 static const unsigned InvalidCycle = ~0U;
1970 
1971 SchedBoundary::~SchedBoundary() { delete HazardRec; }
1972 
1973 /// Given a Count of resource usage and a Latency value, return true if a
1974 /// SchedBoundary becomes resource limited.
1975 /// If we are checking after scheduling a node, we should return true when
1976 /// we just reach the resource limit.
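/// For example, with LFactor == 2 and Latency == 3, ResCntFactor is Count - 6,
/// so after scheduling a node the boundary becomes resource limited once Count
/// reaches 8.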
1977 static bool checkResourceLimit(unsigned LFactor, unsigned Count,
1978                                unsigned Latency, bool AfterSchedNode) {
1979   int ResCntFactor = (int)(Count - (Latency * LFactor));
1980   if (AfterSchedNode)
1981     return ResCntFactor >= (int)LFactor;
1982   else
1983     return ResCntFactor > (int)LFactor;
1984 }
1985 
1986 void SchedBoundary::reset() {
1987   // A new HazardRec is created for each DAG and owned by SchedBoundary.
1988   // Destroying and reconstructing it is very expensive though. So keep
1989   // invalid, placeholder HazardRecs.
1990   if (HazardRec && HazardRec->isEnabled()) {
1991     delete HazardRec;
1992     HazardRec = nullptr;
1993   }
1994   Available.clear();
1995   Pending.clear();
1996   CheckPending = false;
1997   CurrCycle = 0;
1998   CurrMOps = 0;
1999   MinReadyCycle = std::numeric_limits<unsigned>::max();
2000   ExpectedLatency = 0;
2001   DependentLatency = 0;
2002   RetiredMOps = 0;
2003   MaxExecutedResCount = 0;
2004   ZoneCritResIdx = 0;
2005   IsResourceLimited = false;
2006   ReservedCycles.clear();
2007   ReservedCyclesIndex.clear();
2008   ResourceGroupSubUnitMasks.clear();
2009 #ifndef NDEBUG
2010   // Track the maximum number of stall cycles that could arise either from the
2011   // latency of a DAG edge or the number of cycles that a processor resource is
2012   // reserved (SchedBoundary::ReservedCycles).
2013   MaxObservedStall = 0;
2014 #endif
2015   // Reserve a zero-count for invalid CritResIdx.
2016   ExecutedResCounts.resize(1);
2017   assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
2018 }
2019 
2020 void SchedRemainder::
2021 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
2022   reset();
2023   if (!SchedModel->hasInstrSchedModel())
2024     return;
2025   RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
2026   for (SUnit &SU : DAG->SUnits) {
2027     const MCSchedClassDesc *SC = DAG->getSchedClass(&SU);
2028     RemIssueCount += SchedModel->getNumMicroOps(SU.getInstr(), SC)
2029       * SchedModel->getMicroOpFactor();
2030     for (TargetSchedModel::ProcResIter
2031            PI = SchedModel->getWriteProcResBegin(SC),
2032            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2033       unsigned PIdx = PI->ProcResourceIdx;
2034       unsigned Factor = SchedModel->getResourceFactor(PIdx);
2035       RemainingCounts[PIdx] += (Factor * PI->Cycles);
2036     }
2037   }
2038 }
2039 
2040 void SchedBoundary::
2041 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
2042   reset();
2043   DAG = dag;
2044   SchedModel = smodel;
2045   Rem = rem;
2046   if (SchedModel->hasInstrSchedModel()) {
2047     unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
2048     ReservedCyclesIndex.resize(ResourceCount);
2049     ExecutedResCounts.resize(ResourceCount);
2050     ResourceGroupSubUnitMasks.resize(ResourceCount, APInt(ResourceCount, 0));
2051     unsigned NumUnits = 0;
2052 
2053     for (unsigned i = 0; i < ResourceCount; ++i) {
2054       ReservedCyclesIndex[i] = NumUnits;
2055       NumUnits += SchedModel->getProcResource(i)->NumUnits;
2056       if (isUnbufferedGroup(i)) {
2057         auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin;
2058         for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits;
2059              U != UE; ++U)
2060           ResourceGroupSubUnitMasks[i].setBit(SubUnits[U]);
2061       }
2062     }
2063 
2064     ReservedCycles.resize(NumUnits, InvalidCycle);
2065   }
2066 }
2067 
2068 /// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
2069 /// these "soft stalls" differently than the hard stall cycles based on CPU
2070 /// resources and computed by checkHazard(). A fully in-order model
2071 /// (MicroOpBufferSize==0) will not make use of this since instructions are not
2072 /// available for scheduling until they are ready. However, a weaker in-order
2073 /// model may use this for heuristics. For example, if a processor has in-order
2074 /// behavior when reading certain resources, this may come into play.
2075 unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) {
2076   if (!SU->isUnbuffered)
2077     return 0;
2078 
2079   unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
2080   if (ReadyCycle > CurrCycle)
2081     return ReadyCycle - CurrCycle;
2082   return 0;
2083 }
2084 
2085 /// Compute the next cycle at which the given processor resource unit
2086 /// can be scheduled.
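/// For bottom-up scheduling the current operation's cycles are added on top,
/// e.g. a unit last reserved at cycle 5 with Cycles == 2 is next available at
/// cycle 7.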
2087 unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx,
2088                                                        unsigned Cycles) {
2089   unsigned NextUnreserved = ReservedCycles[InstanceIdx];
2090   // If this resource has never been used, always return cycle zero.
2091   if (NextUnreserved == InvalidCycle)
2092     return 0;
2093   // For bottom-up scheduling add the cycles needed for the current operation.
2094   if (!isTop())
2095     NextUnreserved += Cycles;
2096   return NextUnreserved;
2097 }
2098 
2099 /// Compute the next cycle at which the given processor resource can be
2100 /// scheduled.  Returns the next cycle and the index of the processor resource
2101 /// instance in the reserved cycles vector.
2102 std::pair<unsigned, unsigned>
2103 SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
2104                                     unsigned Cycles) {
2105 
2106   unsigned MinNextUnreserved = InvalidCycle;
2107   unsigned InstanceIdx = 0;
2108   unsigned StartIndex = ReservedCyclesIndex[PIdx];
2109   unsigned NumberOfInstances = SchedModel->getProcResource(PIdx)->NumUnits;
2110   assert(NumberOfInstances > 0 &&
2111          "Cannot have zero instances of a ProcResource");
2112 
2113   if (isUnbufferedGroup(PIdx)) {
2114     // If any subunits are used by the instruction, report that the resource
2115     // group is available at 0, effectively removing the group record from
2116     // hazarding and basing the hazarding decisions on the subunit records.
2117     // Otherwise, choose the first available instance from among the subunits.
2118     // Specifications which assign cycles to both the subunits and the group or
2119     // which use an unbuffered group with buffered subunits will appear to
2120     // schedule strangely. In the first case, the additional cycles for the
2121     // group will be ignored.  In the second, the group will be ignored
2122     // entirely.
2123     for (const MCWriteProcResEntry &PE :
2124          make_range(SchedModel->getWriteProcResBegin(SC),
2125                     SchedModel->getWriteProcResEnd(SC)))
2126       if (ResourceGroupSubUnitMasks[PIdx][PE.ProcResourceIdx])
2127         return std::make_pair(0u, StartIndex);
2128 
2129     auto SubUnits = SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin;
2130     for (unsigned I = 0, End = NumberOfInstances; I < End; ++I) {
2131       unsigned NextUnreserved, NextInstanceIdx;
2132       std::tie(NextUnreserved, NextInstanceIdx) =
2133           getNextResourceCycle(SC, SubUnits[I], Cycles);
2134       if (MinNextUnreserved > NextUnreserved) {
2135         InstanceIdx = NextInstanceIdx;
2136         MinNextUnreserved = NextUnreserved;
2137       }
2138     }
2139     return std::make_pair(MinNextUnreserved, InstanceIdx);
2140   }
2141 
2142   for (unsigned I = StartIndex, End = StartIndex + NumberOfInstances; I < End;
2143        ++I) {
2144     unsigned NextUnreserved = getNextResourceCycleByInstance(I, Cycles);
2145     if (MinNextUnreserved > NextUnreserved) {
2146       InstanceIdx = I;
2147       MinNextUnreserved = NextUnreserved;
2148     }
2149   }
2150   return std::make_pair(MinNextUnreserved, InstanceIdx);
2151 }
2152 
2153 /// Does this SU have a hazard within the current instruction group.
2154 ///
2155 /// The scheduler supports two modes of hazard recognition. The first is the
2156 /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
2157 /// supports highly complicated in-order reservation tables
2158 /// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
2159 ///
2160 /// The second is a streamlined mechanism that checks for hazards based on
2161 /// simple counters that the scheduler itself maintains. It explicitly checks
2162 /// for instruction dispatch limitations, including the number of micro-ops that
2163 /// can dispatch per cycle.
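/// For example, with an issue width of 4 and 3 micro-ops already issued in the
/// current cycle, a 2-micro-op instruction is reported as a hazard.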
2164 ///
2165 /// TODO: Also check whether the SU must start a new group.
2166 bool SchedBoundary::checkHazard(SUnit *SU) {
2167   if (HazardRec->isEnabled()
2168       && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
2169     return true;
2170   }
2171 
2172   unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
2173   if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
2174     LLVM_DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") uops="
2175                       << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
2176     return true;
2177   }
2178 
2179   if (CurrMOps > 0 &&
2180       ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
2181        (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
2182     LLVM_DEBUG(dbgs() << "  hazard: SU(" << SU->NodeNum << ") must "
2183                       << (isTop() ? "begin" : "end") << " group\n");
2184     return true;
2185   }
2186 
2187   if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
2188     const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2189     for (const MCWriteProcResEntry &PE :
2190           make_range(SchedModel->getWriteProcResBegin(SC),
2191                      SchedModel->getWriteProcResEnd(SC))) {
2192       unsigned ResIdx = PE.ProcResourceIdx;
2193       unsigned Cycles = PE.Cycles;
2194       unsigned NRCycle, InstanceIdx;
2195       std::tie(NRCycle, InstanceIdx) = getNextResourceCycle(SC, ResIdx, Cycles);
2196       if (NRCycle > CurrCycle) {
2197 #ifndef NDEBUG
2198         MaxObservedStall = std::max(Cycles, MaxObservedStall);
2199 #endif
2200         LLVM_DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") "
2201                           << SchedModel->getResourceName(ResIdx)
2202                           << '[' << InstanceIdx - ReservedCyclesIndex[ResIdx]  << ']'
2203                           << "=" << NRCycle << "c\n");
2204         return true;
2205       }
2206     }
2207   }
2208   return false;
2209 }
2210 
2211 // Find the unscheduled node in ReadySUs with the highest latency.
2212 unsigned SchedBoundary::
2213 findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
2214   SUnit *LateSU = nullptr;
2215   unsigned RemLatency = 0;
2216   for (SUnit *SU : ReadySUs) {
2217     unsigned L = getUnscheduledLatency(SU);
2218     if (L > RemLatency) {
2219       RemLatency = L;
2220       LateSU = SU;
2221     }
2222   }
2223   if (LateSU) {
2224     LLVM_DEBUG(dbgs() << Available.getName() << " RemLatency SU("
2225                       << LateSU->NodeNum << ") " << RemLatency << "c\n");
2226   }
2227   return RemLatency;
2228 }
2229 
2230 // Count resources in this zone and the remaining unscheduled
2231 // instructions. Return the max count, scaled. Set OtherCritIdx to the critical
2232 // resource index, or zero if the zone is issue limited.
2233 unsigned SchedBoundary::
2234 getOtherResourceCount(unsigned &OtherCritIdx) {
2235   OtherCritIdx = 0;
2236   if (!SchedModel->hasInstrSchedModel())
2237     return 0;
2238 
2239   unsigned OtherCritCount = Rem->RemIssueCount
2240     + (RetiredMOps * SchedModel->getMicroOpFactor());
2241   LLVM_DEBUG(dbgs() << "  " << Available.getName() << " + Remain MOps: "
2242                     << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
2243   for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
2244        PIdx != PEnd; ++PIdx) {
2245     unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
2246     if (OtherCount > OtherCritCount) {
2247       OtherCritCount = OtherCount;
2248       OtherCritIdx = PIdx;
2249     }
2250   }
2251   if (OtherCritIdx) {
2252     LLVM_DEBUG(
2253         dbgs() << "  " << Available.getName() << " + Remain CritRes: "
2254                << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
2255                << " " << SchedModel->getResourceName(OtherCritIdx) << "\n");
2256   }
2257   return OtherCritCount;
2258 }
2259 
2260 void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle, bool InPQueue,
2261                                 unsigned Idx) {
2262   assert(SU->getInstr() && "Scheduled SUnit must have instr");
2263 
2264 #ifndef NDEBUG
2265   // ReadyCycle was bumped up to CurrCycle when this node was scheduled, but
2266   // CurrCycle may have been eagerly advanced immediately after scheduling, so
2267   // it may now be greater than ReadyCycle.
2268   if (ReadyCycle > CurrCycle)
2269     MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall);
2270 #endif
2271 
2272   if (ReadyCycle < MinReadyCycle)
2273     MinReadyCycle = ReadyCycle;
2274 
2275   // Check for interlocks first. For the purpose of other heuristics, an
2276   // instruction that cannot issue appears as if it's not in the ReadyQueue.
2277   bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
2278   bool HazardDetected = (!IsBuffered && ReadyCycle > CurrCycle) ||
2279                         checkHazard(SU) || (Available.size() >= ReadyListLimit);
2280 
2281   if (!HazardDetected) {
2282     Available.push(SU);
2283 
2284     if (InPQueue)
2285       Pending.remove(Pending.begin() + Idx);
2286     return;
2287   }
2288 
2289   if (!InPQueue)
2290     Pending.push(SU);
2291 }
2292 
2293 /// Move the boundary of scheduled code by one cycle.
2294 void SchedBoundary::bumpCycle(unsigned NextCycle) {
2295   if (SchedModel->getMicroOpBufferSize() == 0) {
2296     assert(MinReadyCycle < std::numeric_limits<unsigned>::max() &&
2297            "MinReadyCycle uninitialized");
2298     if (MinReadyCycle > NextCycle)
2299       NextCycle = MinReadyCycle;
2300   }
2301   // Update the current micro-ops, which will issue in the next cycle.
2302   unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
2303   CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
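  // E.g. an issue width of 4 advancing 2 cycles clears up to 8 micro-ops from
  // CurrMOps.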
2304 
2305   // Decrement DependentLatency based on the next cycle.
2306   if ((NextCycle - CurrCycle) > DependentLatency)
2307     DependentLatency = 0;
2308   else
2309     DependentLatency -= (NextCycle - CurrCycle);
2310 
2311   if (!HazardRec->isEnabled()) {
2312     // Bypass HazardRec virtual calls.
2313     CurrCycle = NextCycle;
2314   } else {
2315     // Bypass getHazardType calls in case of long latency.
2316     for (; CurrCycle != NextCycle; ++CurrCycle) {
2317       if (isTop())
2318         HazardRec->AdvanceCycle();
2319       else
2320         HazardRec->RecedeCycle();
2321     }
2322   }
2323   CheckPending = true;
2324   IsResourceLimited =
2325       checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
2326                          getScheduledLatency(), true);
2327 
2328   LLVM_DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName()
2329                     << '\n');
2330 }
2331 
2332 void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) {
2333   ExecutedResCounts[PIdx] += Count;
2334   if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
2335     MaxExecutedResCount = ExecutedResCounts[PIdx];
2336 }
2337 
2338 /// Add the given processor resource to this scheduled zone.
2339 ///
2340 /// \param Cycles indicates the number of consecutive (non-pipelined) cycles
2341 /// during which this resource is consumed.
2342 ///
2343 /// \return the next cycle at which the instruction may execute without
2344 /// oversubscribing resources.
2345 unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx,
2346                                       unsigned Cycles, unsigned NextCycle) {
2347   unsigned Factor = SchedModel->getResourceFactor(PIdx);
2348   unsigned Count = Factor * Cycles;
2349   LLVM_DEBUG(dbgs() << "  " << SchedModel->getResourceName(PIdx) << " +"
2350                     << Cycles << "x" << Factor << "u\n");
2351 
2352   // Update Executed resources counts.
2353   incExecutedResources(PIdx, Count);
2354   assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
2355   Rem->RemainingCounts[PIdx] -= Count;
2356 
2357   // Check if this resource exceeds the current critical resource. If so, it
2358   // becomes the critical resource.
2359   if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
2360     ZoneCritResIdx = PIdx;
2361     LLVM_DEBUG(dbgs() << "  *** Critical resource "
2362                       << SchedModel->getResourceName(PIdx) << ": "
2363                       << getResourceCount(PIdx) / SchedModel->getLatencyFactor()
2364                       << "c\n");
2365   }
2366   // For reserved resources, record the highest cycle using the resource.
2367   unsigned NextAvailable, InstanceIdx;
2368   std::tie(NextAvailable, InstanceIdx) = getNextResourceCycle(SC, PIdx, Cycles);
2369   if (NextAvailable > CurrCycle) {
2370     LLVM_DEBUG(dbgs() << "  Resource conflict: "
2371                       << SchedModel->getResourceName(PIdx)
2372                       << '[' << InstanceIdx - ReservedCyclesIndex[PIdx]  << ']'
2373                       << " reserved until @" << NextAvailable << "\n");
2374   }
2375   return NextAvailable;
2376 }
2377 
2378 /// Move the boundary of scheduled code by one SUnit.
2379 void SchedBoundary::bumpNode(SUnit *SU) {
2380   // Update the reservation table.
2381   if (HazardRec->isEnabled()) {
2382     if (!isTop() && SU->isCall) {
2383       // Calls are scheduled with their preceding instructions. For bottom-up
2384       // scheduling, clear the pipeline state before emitting.
2385       HazardRec->Reset();
2386     }
2387     HazardRec->EmitInstruction(SU);
2388     // Scheduling an instruction may have made pending instructions available.
2389     CheckPending = true;
2390   }
2391   // checkHazard should prevent scheduling more instructions per cycle than
2392   // the issue width allows.
2393   const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2394   unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
2395   assert(
2396       (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) &&
2397       "Cannot schedule this instruction's MicroOps in the current cycle.");
2398 
2399   unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
2400   LLVM_DEBUG(dbgs() << "  Ready @" << ReadyCycle << "c\n");
2401 
2402   unsigned NextCycle = CurrCycle;
2403   switch (SchedModel->getMicroOpBufferSize()) {
2404   case 0:
2405     assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
2406     break;
2407   case 1:
2408     if (ReadyCycle > NextCycle) {
2409       NextCycle = ReadyCycle;
2410       LLVM_DEBUG(dbgs() << "  *** Stall until: " << ReadyCycle << "\n");
2411     }
2412     break;
2413   default:
2414     // We don't currently model the OOO reorder buffer, so consider all
2415     // scheduled MOps to be "retired". We do loosely model in-order resource
2416     // latency. If this instruction uses an in-order resource, account for any
2417     // likely stall cycles.
2418     if (SU->isUnbuffered && ReadyCycle > NextCycle)
2419       NextCycle = ReadyCycle;
2420     break;
2421   }
2422   RetiredMOps += IncMOps;
2423 
2424   // Update resource counts and critical resource.
2425   if (SchedModel->hasInstrSchedModel()) {
2426     unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
2427     assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
2428     Rem->RemIssueCount -= DecRemIssue;
2429     if (ZoneCritResIdx) {
2430       // Scale scheduled micro-ops for comparing with the critical resource.
2431       unsigned ScaledMOps =
2432         RetiredMOps * SchedModel->getMicroOpFactor();
2433 
2434       // If scaled micro-ops are now more than the previous critical resource by
2435       // a full cycle, then micro-ops issue becomes critical.
2436       if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
2437           >= (int)SchedModel->getLatencyFactor()) {
2438         ZoneCritResIdx = 0;
2439         LLVM_DEBUG(dbgs() << "  *** Critical resource NumMicroOps: "
2440                           << ScaledMOps / SchedModel->getLatencyFactor()
2441                           << "c\n");
2442       }
2443     }
2444     for (TargetSchedModel::ProcResIter
2445            PI = SchedModel->getWriteProcResBegin(SC),
2446            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2447       unsigned RCycle =
2448         countResource(SC, PI->ProcResourceIdx, PI->Cycles, NextCycle);
2449       if (RCycle > NextCycle)
2450         NextCycle = RCycle;
2451     }
2452     if (SU->hasReservedResource) {
2453       // For reserved resources, record the highest cycle using the resource.
2454       // For top-down scheduling, this is the cycle in which we schedule this
2455       // instruction plus the number of cycles the operations reserves the
2456       // resource. For bottom-up is it simply the instruction's cycle.
2457       for (TargetSchedModel::ProcResIter
2458              PI = SchedModel->getWriteProcResBegin(SC),
2459              PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2460         unsigned PIdx = PI->ProcResourceIdx;
2461         if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
2462           unsigned ReservedUntil, InstanceIdx;
2463           std::tie(ReservedUntil, InstanceIdx) =
2464               getNextResourceCycle(SC, PIdx, 0);
2465           if (isTop()) {
2466             ReservedCycles[InstanceIdx] =
2467                 std::max(ReservedUntil, NextCycle + PI->Cycles);
2468           } else
2469             ReservedCycles[InstanceIdx] = NextCycle;
2470         }
2471       }
2472     }
2473   }
2474   // Update ExpectedLatency and DependentLatency.
2475   unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
2476   unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
2477   if (SU->getDepth() > TopLatency) {
2478     TopLatency = SU->getDepth();
2479     LLVM_DEBUG(dbgs() << "  " << Available.getName() << " TopLatency SU("
2480                       << SU->NodeNum << ") " << TopLatency << "c\n");
2481   }
2482   if (SU->getHeight() > BotLatency) {
2483     BotLatency = SU->getHeight();
2484     LLVM_DEBUG(dbgs() << "  " << Available.getName() << " BotLatency SU("
2485                       << SU->NodeNum << ") " << BotLatency << "c\n");
2486   }
2487   // If we stall for any reason, bump the cycle.
2488   if (NextCycle > CurrCycle)
2489     bumpCycle(NextCycle);
2490   else
2491     // After updating ZoneCritResIdx and ExpectedLatency, check if we're
2492     // resource limited. If a stall occurred, bumpCycle does this.
2493     IsResourceLimited =
2494         checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
2495                            getScheduledLatency(), true);
2496 
2497   // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle
2498   // resets CurrMOps. Loop to handle instructions with more MOps than issue in
2499   // one cycle.  Since we commonly reach the max MOps here, opportunistically
2500   // bump the cycle to avoid uselessly checking everything in the readyQ.
2501   CurrMOps += IncMOps;
2502 
2503   // Bump the cycle count for issue group constraints.
2504   // This must be done after NextCycle has been adjusted for all other stalls.
2505   // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set
2506   // CurrCycle to X.
2507   if ((isTop() &&  SchedModel->mustEndGroup(SU->getInstr())) ||
2508       (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
2509     LLVM_DEBUG(dbgs() << "  Bump cycle to " << (isTop() ? "end" : "begin")
2510                       << " group\n");
2511     bumpCycle(++NextCycle);
2512   }
2513 
2514   while (CurrMOps >= SchedModel->getIssueWidth()) {
2515     LLVM_DEBUG(dbgs() << "  *** Max MOps " << CurrMOps << " at cycle "
2516                       << CurrCycle << '\n');
2517     bumpCycle(++NextCycle);
2518   }
2519   LLVM_DEBUG(dumpScheduledState());
2520 }
2521 
2522 /// Release pending ready nodes into the available queue. This makes them
2523 /// visible to heuristics.
2524 void SchedBoundary::releasePending() {
2525   // If the available queue is empty, it is safe to reset MinReadyCycle.
2526   if (Available.empty())
2527     MinReadyCycle = std::numeric_limits<unsigned>::max();
2528 
2529   // Check to see if any of the pending instructions are ready to issue.  If
2530   // so, add them to the available queue.
2531   for (unsigned I = 0, E = Pending.size(); I < E; ++I) {
2532     SUnit *SU = *(Pending.begin() + I);
2533     unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
2534 
2535     if (ReadyCycle < MinReadyCycle)
2536       MinReadyCycle = ReadyCycle;
2537 
2538     if (Available.size() >= ReadyListLimit)
2539       break;
2540 
2541     releaseNode(SU, ReadyCycle, true, I);
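    // releaseNode may have removed SU from Pending (InPQueue is true); if so,
    // step the index and bound back so no pending node is skipped.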
2542     if (E != Pending.size()) {
2543       --I;
2544       --E;
2545     }
2546   }
2547   CheckPending = false;
2548 }
2549 
2550 /// Remove SU from the ready set for this boundary.
2551 void SchedBoundary::removeReady(SUnit *SU) {
2552   if (Available.isInQueue(SU))
2553     Available.remove(Available.find(SU));
2554   else {
2555     assert(Pending.isInQueue(SU) && "bad ready count");
2556     Pending.remove(Pending.find(SU));
2557   }
2558 }
2559 
2560 /// If this queue only has one ready candidate, return it. As a side effect,
2561 /// defer any nodes that now hit a hazard, and advance the cycle until at least
2562 /// one node is ready. If multiple instructions are ready, return NULL.
2563 SUnit *SchedBoundary::pickOnlyChoice() {
2564   if (CheckPending)
2565     releasePending();
2566 
2567   // Defer any ready instrs that now have a hazard.
2568   for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
2569     if (checkHazard(*I)) {
2570       Pending.push(*I);
2571       I = Available.remove(I);
2572       continue;
2573     }
2574     ++I;
2575   }
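  // If everything got deferred, keep bumping the cycle and re-releasing pending
  // nodes until at least one becomes available.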
2576   for (unsigned i = 0; Available.empty(); ++i) {
2577 //  FIXME: Re-enable assert once PR20057 is resolved.
2578 //    assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) &&
2579 //           "permanent hazard");
2580     (void)i;
2581     bumpCycle(CurrCycle + 1);
2582     releasePending();
2583   }
2584 
2585   LLVM_DEBUG(Pending.dump());
2586   LLVM_DEBUG(Available.dump());
2587 
2588   if (Available.size() == 1)
2589     return *Available.begin();
2590   return nullptr;
2591 }
2592 
2593 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2594 // This is useful information to dump after bumpNode.
2595 // Note that the Queue contents are more useful before pickNodeFromQueue.
2596 LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const {
2597   unsigned ResFactor;
2598   unsigned ResCount;
2599   if (ZoneCritResIdx) {
2600     ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
2601     ResCount = getResourceCount(ZoneCritResIdx);
2602   } else {
2603     ResFactor = SchedModel->getMicroOpFactor();
2604     ResCount = RetiredMOps * ResFactor;
2605   }
2606   unsigned LFactor = SchedModel->getLatencyFactor();
2607   dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
2608          << "  Retired: " << RetiredMOps;
2609   dbgs() << "\n  Executed: " << getExecutedCount() / LFactor << "c";
2610   dbgs() << "\n  Critical: " << ResCount / LFactor << "c, "
2611          << ResCount / ResFactor << " "
2612          << SchedModel->getResourceName(ZoneCritResIdx)
2613          << "\n  ExpectedLatency: " << ExpectedLatency << "c\n"
2614          << (IsResourceLimited ? "  - Resource" : "  - Latency")
2615          << " limited.\n";
2616 }
2617 #endif
2618 
2619 //===----------------------------------------------------------------------===//
2620 // GenericScheduler - Generic implementation of MachineSchedStrategy.
2621 //===----------------------------------------------------------------------===//
2622 
2623 void GenericSchedulerBase::SchedCandidate::
2624 initResourceDelta(const ScheduleDAGMI *DAG,
2625                   const TargetSchedModel *SchedModel) {
2626   if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
2627     return;
2628 
2629   const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2630   for (TargetSchedModel::ProcResIter
2631          PI = SchedModel->getWriteProcResBegin(SC),
2632          PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2633     if (PI->ProcResourceIdx == Policy.ReduceResIdx)
2634       ResDelta.CritResources += PI->Cycles;
2635     if (PI->ProcResourceIdx == Policy.DemandResIdx)
2636       ResDelta.DemandedResources += PI->Cycles;
2637   }
2638 }
2639 
2640 /// Compute remaining latency. We need this both to determine whether the
2641 /// overall schedule has become latency-limited and whether the instructions
2642 /// outside this zone are resource or latency limited.
2643 ///
2644 /// The "dependent" latency is updated incrementally during scheduling as the
2645 /// max height/depth of scheduled nodes minus the cycles since it was
2646 /// scheduled:
2647 ///   DLat = max (N.depth - (CurrCycle - N.ReadyCycle) for N in Zone
2648 ///
2649 /// The "independent" latency is the max ready queue depth:
2650 ///   ILat = max N.depth for N in Available|Pending
2651 ///
2652 /// RemainingLatency is the greater of independent and dependent latency.
2653 ///
2654 /// These computations are expensive, especially in DAGs with many edges, so
2655 /// only do them if necessary.
2656 static unsigned computeRemLatency(SchedBoundary &CurrZone) {
2657   unsigned RemLatency = CurrZone.getDependentLatency();
2658   RemLatency = std::max(RemLatency,
2659                         CurrZone.findMaxLatency(CurrZone.Available.elements()));
2660   RemLatency = std::max(RemLatency,
2661                         CurrZone.findMaxLatency(CurrZone.Pending.elements()));
2662   return RemLatency;
2663 }
2664 
2665 /// Returns true if the current cycle plus remaining latency is greater than
2666 /// the critical path in the scheduling region.
2667 bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy,
2668                                                SchedBoundary &CurrZone,
2669                                                bool ComputeRemLatency,
2670                                                unsigned &RemLatency) const {
2671   // The current cycle is already greater than the critical path, so we are
2672   // already latency limited and don't need to compute the remaining latency.
2673   if (CurrZone.getCurrCycle() > Rem.CriticalPath)
2674     return true;
2675 
2676   // If we haven't scheduled anything yet, then we aren't latency limited.
2677   if (CurrZone.getCurrCycle() == 0)
2678     return false;
2679 
2680   if (ComputeRemLatency)
2681     RemLatency = computeRemLatency(CurrZone);
2682 
2683   return RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath;
2684 }
2685 
2686 /// Set the CandPolicy given a scheduling zone given the current resources and
2687 /// latencies inside and outside the zone.
2688 void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
2689                                      SchedBoundary &CurrZone,
2690                                      SchedBoundary *OtherZone) {
2691   // Apply preemptive heuristics based on the total latency and resources
2692   // inside and outside this zone. Potential stalls should be considered before
2693   // following this policy.
2694 
2695   // Compute the critical resource outside the zone.
2696   unsigned OtherCritIdx = 0;
2697   unsigned OtherCount =
2698     OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0;
2699 
2700   bool OtherResLimited = false;
2701   unsigned RemLatency = 0;
2702   bool RemLatencyComputed = false;
2703   if (SchedModel->hasInstrSchedModel() && OtherCount != 0) {
2704     RemLatency = computeRemLatency(CurrZone);
2705     RemLatencyComputed = true;
2706     OtherResLimited = checkResourceLimit(SchedModel->getLatencyFactor(),
2707                                          OtherCount, RemLatency, false);
2708   }
2709 
2710   // Schedule aggressively for latency in PostRA mode. We don't check for
2711   // acyclic latency during PostRA, and highly out-of-order processors will
2712   // skip PostRA scheduling.
2713   if (!OtherResLimited &&
2714       (IsPostRA || shouldReduceLatency(Policy, CurrZone, !RemLatencyComputed,
2715                                        RemLatency))) {
2716     Policy.ReduceLatency |= true;
2717     LLVM_DEBUG(dbgs() << "  " << CurrZone.Available.getName()
2718                       << " RemainingLatency " << RemLatency << " + "
2719                       << CurrZone.getCurrCycle() << "c > CritPath "
2720                       << Rem.CriticalPath << "\n");
2721   }
2722   // If the same resource is limiting inside and outside the zone, do nothing.
2723   if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
2724     return;
2725 
2726   LLVM_DEBUG(if (CurrZone.isResourceLimited()) {
2727     dbgs() << "  " << CurrZone.Available.getName() << " ResourceLimited: "
2728            << SchedModel->getResourceName(CurrZone.getZoneCritResIdx()) << "\n";
2729   } if (OtherResLimited) dbgs()
2730                  << "  RemainingLimit: "
2731                  << SchedModel->getResourceName(OtherCritIdx) << "\n";
2732              if (!CurrZone.isResourceLimited() && !OtherResLimited) dbgs()
2733              << "  Latency limited both directions.\n");
2734 
2735   if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx)
2736     Policy.ReduceResIdx = CurrZone.getZoneCritResIdx();
2737 
2738   if (OtherResLimited)
2739     Policy.DemandResIdx = OtherCritIdx;
2740 }
2741 
2742 #ifndef NDEBUG
2743 const char *GenericSchedulerBase::getReasonStr(
2744   GenericSchedulerBase::CandReason Reason) {
2745   switch (Reason) {
2746   case NoCand:         return "NOCAND    ";
2747   case Only1:          return "ONLY1     ";
2748   case PhysReg:        return "PHYS-REG  ";
2749   case RegExcess:      return "REG-EXCESS";
2750   case RegCritical:    return "REG-CRIT  ";
2751   case Stall:          return "STALL     ";
2752   case Cluster:        return "CLUSTER   ";
2753   case Weak:           return "WEAK      ";
2754   case RegMax:         return "REG-MAX   ";
2755   case ResourceReduce: return "RES-REDUCE";
2756   case ResourceDemand: return "RES-DEMAND";
2757   case TopDepthReduce: return "TOP-DEPTH ";
2758   case TopPathReduce:  return "TOP-PATH  ";
2759   case BotHeightReduce:return "BOT-HEIGHT";
2760   case BotPathReduce:  return "BOT-PATH  ";
2761   case NextDefUse:     return "DEF-USE   ";
2762   case NodeOrder:      return "ORDER     ";
2763   };
2764   llvm_unreachable("Unknown reason!");
2765 }
2766 
2767 void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
2768   PressureChange P;
2769   unsigned ResIdx = 0;
2770   unsigned Latency = 0;
2771   switch (Cand.Reason) {
2772   default:
2773     break;
2774   case RegExcess:
2775     P = Cand.RPDelta.Excess;
2776     break;
2777   case RegCritical:
2778     P = Cand.RPDelta.CriticalMax;
2779     break;
2780   case RegMax:
2781     P = Cand.RPDelta.CurrentMax;
2782     break;
2783   case ResourceReduce:
2784     ResIdx = Cand.Policy.ReduceResIdx;
2785     break;
2786   case ResourceDemand:
2787     ResIdx = Cand.Policy.DemandResIdx;
2788     break;
2789   case TopDepthReduce:
2790     Latency = Cand.SU->getDepth();
2791     break;
2792   case TopPathReduce:
2793     Latency = Cand.SU->getHeight();
2794     break;
2795   case BotHeightReduce:
2796     Latency = Cand.SU->getHeight();
2797     break;
2798   case BotPathReduce:
2799     Latency = Cand.SU->getDepth();
2800     break;
2801   }
2802   dbgs() << "  Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
2803   if (P.isValid())
2804     dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
2805            << ":" << P.getUnitInc() << " ";
2806   else
2807     dbgs() << "      ";
2808   if (ResIdx)
2809     dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
2810   else
2811     dbgs() << "         ";
2812   if (Latency)
2813     dbgs() << " " << Latency << " cycles ";
2814   else
2815     dbgs() << "          ";
2816   dbgs() << '\n';
2817 }
2818 #endif
2819 
2820 namespace llvm {
2821 /// Return true if this heuristic determines order.
2822 /// TODO: Consider refactoring the return type of these functions to an integer
2823 /// or enum, as we may need to differentiate whether TryCand is better than Cand.
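///
/// In tryLess, a strictly smaller TryVal wins for TryCand; a strictly larger
/// TryVal decides in favor of Cand, upgrading Cand's reason to this Reason if
/// it is stronger (numerically smaller). Equal values leave the decision to
/// later heuristics. tryGreater below is the mirror image.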
2824 bool tryLess(int TryVal, int CandVal,
2825              GenericSchedulerBase::SchedCandidate &TryCand,
2826              GenericSchedulerBase::SchedCandidate &Cand,
2827              GenericSchedulerBase::CandReason Reason) {
2828   if (TryVal < CandVal) {
2829     TryCand.Reason = Reason;
2830     return true;
2831   }
2832   if (TryVal > CandVal) {
2833     if (Cand.Reason > Reason)
2834       Cand.Reason = Reason;
2835     return true;
2836   }
2837   return false;
2838 }
2839 
2840 bool tryGreater(int TryVal, int CandVal,
2841                 GenericSchedulerBase::SchedCandidate &TryCand,
2842                 GenericSchedulerBase::SchedCandidate &Cand,
2843                 GenericSchedulerBase::CandReason Reason) {
2844   if (TryVal > CandVal) {
2845     TryCand.Reason = Reason;
2846     return true;
2847   }
2848   if (TryVal < CandVal) {
2849     if (Cand.Reason > Reason)
2850       Cand.Reason = Reason;
2851     return true;
2852   }
2853   return false;
2854 }
2855 
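/// Latency tie-breaking: in the top zone a node's depth is a lower bound on
/// its earliest issue cycle and its height is the remaining critical path
/// below it, so prefer lower depth (avoid stalls) and then greater height
/// (stay on the critical path); the bottom zone mirrors this with height and
/// depth swapped.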
2856 bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
2857                 GenericSchedulerBase::SchedCandidate &Cand,
2858                 SchedBoundary &Zone) {
2859   if (Zone.isTop()) {
2860     // Prefer the candidate with the lesser depth, but only if one of them has
2861     // depth greater than the total latency scheduled so far, otherwise either
2862     // of them could be scheduled now with no stall.
2863     if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) >
2864         Zone.getScheduledLatency()) {
2865       if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2866                   TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
2867         return true;
2868     }
2869     if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2870                    TryCand, Cand, GenericSchedulerBase::TopPathReduce))
2871       return true;
2872   } else {
2873     // Prefer the candidate with the lesser height, but only if one of them has
2874     // height greater than the total latency scheduled so far, otherwise either
2875     // of them could be scheduled now with no stall.
2876     if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
2877         Zone.getScheduledLatency()) {
2878       if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2879                   TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
2880         return true;
2881     }
2882     if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2883                    TryCand, Cand, GenericSchedulerBase::BotPathReduce))
2884       return true;
2885   }
2886   return false;
2887 }
2888 } // end namespace llvm
2889 
2890 static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {
2891   LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
2892                     << GenericSchedulerBase::getReasonStr(Reason) << '\n');
2893 }
2894 
2895 static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
2896   tracePick(Cand.Reason, Cand.AtTop);
2897 }
2898 
2899 void GenericScheduler::initialize(ScheduleDAGMI *dag) {
2900   assert(dag->hasVRegLiveness() &&
2901          "(PreRA)GenericScheduler needs vreg liveness");
2902   DAG = static_cast<ScheduleDAGMILive*>(dag);
2903   SchedModel = DAG->getSchedModel();
2904   TRI = DAG->TRI;
2905 
2906   if (RegionPolicy.ComputeDFSResult)
2907     DAG->computeDFSResult();
2908 
2909   Rem.init(DAG, SchedModel);
2910   Top.init(DAG, SchedModel, &Rem);
2911   Bot.init(DAG, SchedModel, &Rem);
2912 
2913   // Initialize resource counts.
2914 
2915   // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
2916   // are disabled, then these HazardRecs will be disabled.
2917   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
2918   if (!Top.HazardRec) {
2919     Top.HazardRec =
2920         DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
2921             Itin, DAG);
2922   }
2923   if (!Bot.HazardRec) {
2924     Bot.HazardRec =
2925         DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
2926             Itin, DAG);
2927   }
2928   TopCand.SU = nullptr;
2929   BotCand.SU = nullptr;
2930 }
2931 
2932 /// Initialize the per-region scheduling policy.
2933 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
2934                                   MachineBasicBlock::iterator End,
2935                                   unsigned NumRegionInstrs) {
2936   const MachineFunction &MF = *Begin->getMF();
2937   const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
2938 
2939   // Avoid setting up the register pressure tracker for small regions to save
2940   // compile time. As a rough heuristic, only track pressure when the number of
2941   // schedulable instructions exceeds half the integer register file.
2942   RegionPolicy.ShouldTrackPressure = true;
2943   for (unsigned VT = MVT::i32; VT > (unsigned)MVT::i1; --VT) {
2944     MVT::SimpleValueType LegalIntVT = (MVT::SimpleValueType)VT;
2945     if (TLI->isTypeLegal(LegalIntVT)) {
2946       unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
2947         TLI->getRegClassFor(LegalIntVT));
2948       RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
2949     }
2950   }
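  // As an illustration: with 32 allocatable registers in the chosen integer
  // class, pressure is tracked only for regions of more than 16 instructions.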
2951 
2952   // For generic targets, we default to bottom-up, because it's simpler, and more
2953   // compile-time optimizations have been implemented in that direction.
2954   RegionPolicy.OnlyBottomUp = true;
2955 
2956   // Allow the subtarget to override default policy.
2957   MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);
2958 
2959   // After subtarget overrides, apply command line options.
2960   if (!EnableRegPressure) {
2961     RegionPolicy.ShouldTrackPressure = false;
2962     RegionPolicy.ShouldTrackLaneMasks = false;
2963   }
2964 
2965   // Check -misched-topdown/bottomup can force or unforce scheduling direction.
2966   // e.g. -misched-bottomup=false allows scheduling in both directions.
2967   assert((!ForceTopDown || !ForceBottomUp) &&
2968          "-misched-topdown incompatible with -misched-bottomup");
2969   if (ForceBottomUp.getNumOccurrences() > 0) {
2970     RegionPolicy.OnlyBottomUp = ForceBottomUp;
2971     if (RegionPolicy.OnlyBottomUp)
2972       RegionPolicy.OnlyTopDown = false;
2973   }
2974   if (ForceTopDown.getNumOccurrences() > 0) {
2975     RegionPolicy.OnlyTopDown = ForceTopDown;
2976     if (RegionPolicy.OnlyTopDown)
2977       RegionPolicy.OnlyBottomUp = false;
2978   }
2979 }
2980 
2981 void GenericScheduler::dumpPolicy() const {
2982   // Cannot completely remove virtual function even in release mode.
2983 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2984   dbgs() << "GenericScheduler RegionPolicy: "
2985          << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure
2986          << " OnlyTopDown=" << RegionPolicy.OnlyTopDown
2987          << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp
2988          << "\n";
2989 #endif
2990 }
2991 
2992 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
2993 /// critical path by more cycles than it takes to drain the instruction buffer.
2994 /// We estimate an upper bound on in-flight instructions as:
2995 ///
2996 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
2997 /// InFlightIterations = AcyclicPath / CyclesPerIteration
2998 /// InFlightResources = InFlightIterations * LoopResources
2999 ///
3000 /// TODO: Check execution resources in addition to IssueCount.
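///
/// Illustrative (hypothetical) numbers: with CyclicPath = 10 cycles, a
/// resource height of at most 10, AcyclicPath = 40 cycles, and 12 micro-ops
/// issued per iteration, roughly 40 / 10 = 4 iterations (about 48 micro-ops)
/// would need to be in flight; if that exceeds the micro-op buffer, the loop
/// is treated as acyclic-latency limited.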
3001 void GenericScheduler::checkAcyclicLatency() {
3002   if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
3003     return;
3004 
3005   // Scaled number of cycles per loop iteration.
3006   unsigned IterCount =
3007     std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
3008              Rem.RemIssueCount);
3009   // Scaled acyclic critical path.
3010   unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
3011   // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
3012   unsigned InFlightCount =
3013     (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
3014   unsigned BufferLimit =
3015     SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
3016 
3017   Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
3018 
3019   LLVM_DEBUG(
3020       dbgs() << "IssueCycles="
3021              << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
3022              << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
3023              << "c NumIters=" << (AcyclicCount + IterCount - 1) / IterCount
3024              << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
3025              << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
3026       if (Rem.IsAcyclicLatencyLimited) dbgs() << "  ACYCLIC LATENCY LIMIT\n");
3027 }
3028 
3029 void GenericScheduler::registerRoots() {
3030   Rem.CriticalPath = DAG->ExitSU.getDepth();
3031 
3032   // Some roots may not feed into ExitSU. Check all of them in case.
3033   for (const SUnit *SU : Bot.Available) {
3034     if (SU->getDepth() > Rem.CriticalPath)
3035       Rem.CriticalPath = SU->getDepth();
3036   }
3037   LLVM_DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
3038   if (DumpCriticalPathLength) {
3039     errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
3040   }
3041 
3042   if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) {
3043     Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
3044     checkAcyclicLatency();
3045   }
3046 }
3047 
3048 namespace llvm {
3049 bool tryPressure(const PressureChange &TryP,
3050                  const PressureChange &CandP,
3051                  GenericSchedulerBase::SchedCandidate &TryCand,
3052                  GenericSchedulerBase::SchedCandidate &Cand,
3053                  GenericSchedulerBase::CandReason Reason,
3054                  const TargetRegisterInfo *TRI,
3055                  const MachineFunction &MF) {
3056   // If one candidate decreases and the other increases, go with it.
3057   // Invalid candidates have UnitInc==0.
3058   if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
3059                  Reason)) {
3060     return true;
3061   }
3062   // Do not compare the magnitude of pressure changes between top and bottom
3063   // boundary.
3064   if (Cand.AtTop != TryCand.AtTop)
3065     return false;
3066 
3067   // If both candidates affect the same set in the same boundary, go with the
3068   // smallest increase.
3069   unsigned TryPSet = TryP.getPSetOrMax();
3070   unsigned CandPSet = CandP.getPSetOrMax();
3071   if (TryPSet == CandPSet) {
3072     return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
3073                    Reason);
3074   }
3075 
3076   int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :
3077                                  std::numeric_limits<int>::max();
3078 
3079   int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) :
3080                                    std::numeric_limits<int>::max();
3081 
3082   // If the candidates are decreasing pressure, reverse priority.
3083   if (TryP.getUnitInc() < 0)
3084     std::swap(TryRank, CandRank);
3085   return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
3086 }
3087 
3088 unsigned getWeakLeft(const SUnit *SU, bool isTop) {
3089   return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
3090 }
3091 
3092 /// Minimize physical register live ranges. Regalloc wants them adjacent to
3093 /// their physreg def/use.
3094 ///
3095 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
3096 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
3097 /// with the operation that produces or consumes the physreg. We'll do this when
3098 /// regalloc has support for parallel copies.
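///
/// A positive return value biases the scheduler toward picking SU from the
/// current zone sooner, a negative value toward deferring it, and 0 expresses
/// no preference; callers compare the two candidates' biases with tryGreater.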
3099 int biasPhysReg(const SUnit *SU, bool isTop) {
3100   const MachineInstr *MI = SU->getInstr();
3101 
3102   if (MI->isCopy()) {
3103     unsigned ScheduledOper = isTop ? 1 : 0;
3104     unsigned UnscheduledOper = isTop ? 0 : 1;
3105     // If we have already scheduled the physreg producer/consumer, immediately
3106     // schedule the copy.
3107     if (Register::isPhysicalRegister(MI->getOperand(ScheduledOper).getReg()))
3108       return 1;
3109     // If the physreg is at the boundary, defer it. Otherwise schedule it
3110     // immediately to free the dependent. We can hoist the copy later.
3111     bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
3112     if (Register::isPhysicalRegister(MI->getOperand(UnscheduledOper).getReg()))
3113       return AtBoundary ? -1 : 1;
3114   }
3115 
3116   if (MI->isMoveImmediate()) {
3117     // If we have a move immediate and all successors have been assigned, bias
3118     // towards scheduling this later. Make sure all register defs are to
3119     // physical registers.
3120     bool DoBias = true;
3121     for (const MachineOperand &Op : MI->defs()) {
3122       if (Op.isReg() && !Register::isPhysicalRegister(Op.getReg())) {
3123         DoBias = false;
3124         break;
3125       }
3126     }
3127 
3128     if (DoBias)
3129       return isTop ? -1 : 1;
3130   }
3131 
3132   return 0;
3133 }
3134 } // end namespace llvm
3135 
3136 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
3137                                      bool AtTop,
3138                                      const RegPressureTracker &RPTracker,
3139                                      RegPressureTracker &TempTracker) {
3140   Cand.SU = SU;
3141   Cand.AtTop = AtTop;
3142   if (DAG->isTrackingPressure()) {
3143     if (AtTop) {
3144       TempTracker.getMaxDownwardPressureDelta(
3145         Cand.SU->getInstr(),
3146         Cand.RPDelta,
3147         DAG->getRegionCriticalPSets(),
3148         DAG->getRegPressure().MaxSetPressure);
3149     } else {
3150       if (VerifyScheduling) {
3151         TempTracker.getMaxUpwardPressureDelta(
3152           Cand.SU->getInstr(),
3153           &DAG->getPressureDiff(Cand.SU),
3154           Cand.RPDelta,
3155           DAG->getRegionCriticalPSets(),
3156           DAG->getRegPressure().MaxSetPressure);
3157       } else {
3158         RPTracker.getUpwardPressureDelta(
3159           Cand.SU->getInstr(),
3160           DAG->getPressureDiff(Cand.SU),
3161           Cand.RPDelta,
3162           DAG->getRegionCriticalPSets(),
3163           DAG->getRegPressure().MaxSetPressure);
3164       }
3165     }
3166   }
3167   LLVM_DEBUG(if (Cand.RPDelta.Excess.isValid()) dbgs()
3168              << "  Try  SU(" << Cand.SU->NodeNum << ") "
3169              << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) << ":"
3170              << Cand.RPDelta.Excess.getUnitInc() << "\n");
3171 }
3172 
3173 /// Apply a set of heuristics to a new candidate. Heuristics are currently
3174 /// hierarchical. This may be more efficient than a graduated cost model because
3175 /// we don't need to evaluate all aspects of the model for each node in the
3176 /// queue. But it's really done to make the heuristics easier to debug and
3177 /// statistically analyze.
3178 ///
3179 /// \param Cand provides the policy and current best candidate.
3180 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
3181 /// \param Zone describes the scheduled zone that we are extending, or nullptr
3182 ///             if Cand is from a different zone than TryCand.
3183 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
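///
/// Heuristics are applied in decreasing priority: physreg bias, excess
/// pressure, critical pressure, acyclic latency and stall cycles (same
/// boundary only), clustering, weak edges (same boundary only), max pressure,
/// and, within the same boundary, resource balance, latency, and finally the
/// original instruction order.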
3184 bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
3185                                     SchedCandidate &TryCand,
3186                                     SchedBoundary *Zone) const {
3187   // Initialize the candidate if needed.
3188   if (!Cand.isValid()) {
3189     TryCand.Reason = NodeOrder;
3190     return true;
3191   }
3192 
3193   // Bias PhysReg Defs and copies to their uses and defs, respectively.
3194   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
3195                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
3196     return TryCand.Reason != NoCand;
3197 
3198   // Avoid exceeding the target's limit.
3199   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
3200                                                Cand.RPDelta.Excess,
3201                                                TryCand, Cand, RegExcess, TRI,
3202                                                DAG->MF))
3203     return TryCand.Reason != NoCand;
3204 
3205   // Avoid increasing the max critical pressure in the scheduled region.
3206   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
3207                                                Cand.RPDelta.CriticalMax,
3208                                                TryCand, Cand, RegCritical, TRI,
3209                                                DAG->MF))
3210     return TryCand.Reason != NoCand;
3211 
3212   // We only compare a subset of features when comparing nodes between
3213   // the Top and Bottom boundaries. Some properties are simply incomparable; in
3214   // many other instances we should only override the other boundary if something
3215   // is a clearly good pick on one boundary. Skip heuristics that are more
3216   // "tie-breaking" in nature.
3217   bool SameBoundary = Zone != nullptr;
3218   if (SameBoundary) {
3219     // For loops that are acyclic path limited, aggressively schedule for
3220     // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
3221     // heuristics to take precedence.
3222     if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
3223         tryLatency(TryCand, Cand, *Zone))
3224       return TryCand.Reason != NoCand;
3225 
3226     // Prioritize instructions that read unbuffered resources by stall cycles.
3227     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
3228                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
3229       return TryCand.Reason != NoCand;
3230   }
3231 
3232   // Keep clustered nodes together to encourage downstream peephole
3233   // optimizations which may reduce resource requirements.
3234   //
3235   // This is a best effort to set things up for a post-RA pass. Optimizations
3236   // like generating loads of multiple registers should ideally be done within
3237   // the scheduler pass by combining the loads during DAG postprocessing.
3238   const SUnit *CandNextClusterSU =
3239     Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
3240   const SUnit *TryCandNextClusterSU =
3241     TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
3242   if (tryGreater(TryCand.SU == TryCandNextClusterSU,
3243                  Cand.SU == CandNextClusterSU,
3244                  TryCand, Cand, Cluster))
3245     return TryCand.Reason != NoCand;
3246 
3247   if (SameBoundary) {
3248     // Weak edges are for clustering and other constraints.
3249     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
3250                 getWeakLeft(Cand.SU, Cand.AtTop),
3251                 TryCand, Cand, Weak))
3252       return TryCand.Reason != NoCand;
3253   }
3254 
3255   // Avoid increasing the max pressure of the entire region.
3256   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
3257                                                Cand.RPDelta.CurrentMax,
3258                                                TryCand, Cand, RegMax, TRI,
3259                                                DAG->MF))
3260     return TryCand.Reason != NoCand;
3261 
3262   if (SameBoundary) {
3263     // Avoid critical resource consumption and balance the schedule.
3264     TryCand.initResourceDelta(DAG, SchedModel);
3265     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
3266                 TryCand, Cand, ResourceReduce))
3267       return TryCand.Reason != NoCand;
3268     if (tryGreater(TryCand.ResDelta.DemandedResources,
3269                    Cand.ResDelta.DemandedResources,
3270                    TryCand, Cand, ResourceDemand))
3271       return TryCand.Reason != NoCand;
3272 
3273     // Avoid serializing long latency dependence chains.
3274     // For acyclic path limited loops, latency was already checked above.
3275     if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
3276         !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
3277       return TryCand.Reason != NoCand;
3278 
3279     // Fall through to original instruction order.
3280     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
3281         || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
3282       TryCand.Reason = NodeOrder;
3283       return true;
3284     }
3285   }
3286 
3287   return false;
3288 }
3289 
3290 /// Pick the best candidate from the queue.
3291 ///
3292 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
3293 /// DAG building. To adjust for the current scheduling location we need to
3294 /// maintain the number of vreg uses remaining to be top-scheduled.
3295 void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
3296                                          const CandPolicy &ZonePolicy,
3297                                          const RegPressureTracker &RPTracker,
3298                                          SchedCandidate &Cand) {
3299   // getMaxPressureDelta temporarily modifies the tracker.
3300   RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
3301 
3302   ReadyQueue &Q = Zone.Available;
3303   for (SUnit *SU : Q) {
3304 
3305     SchedCandidate TryCand(ZonePolicy);
3306     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker);
3307     // Pass SchedBoundary only when comparing nodes from the same boundary.
3308     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
3309     if (tryCandidate(Cand, TryCand, ZoneArg)) {
3310       // Initialize resource delta if needed in case future heuristics query it.
3311       if (TryCand.ResDelta == SchedResourceDelta())
3312         TryCand.initResourceDelta(DAG, SchedModel);
3313       Cand.setBest(TryCand);
3314       LLVM_DEBUG(traceCandidate(Cand));
3315     }
3316   }
3317 }
3318 
3319 /// Pick the best candidate node from either the top or bottom queue.
3320 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
3321   // Schedule as far as possible in the direction of no choice. This is most
3322   // efficient, but also provides the best heuristics for CriticalPSets.
3323   if (SUnit *SU = Bot.pickOnlyChoice()) {
3324     IsTopNode = false;
3325     tracePick(Only1, false);
3326     return SU;
3327   }
3328   if (SUnit *SU = Top.pickOnlyChoice()) {
3329     IsTopNode = true;
3330     tracePick(Only1, true);
3331     return SU;
3332   }
3333   // Set the bottom-up policy based on the state of the current bottom zone and
3334   // the instructions outside the zone, including the top zone.
3335   CandPolicy BotPolicy;
3336   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
3337   // Set the top-down policy based on the state of the current top zone and
3338   // the instructions outside the zone, including the bottom zone.
3339   CandPolicy TopPolicy;
3340   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
3341 
3342   // See if BotCand is still valid (because we previously scheduled from Top).
3343   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
3344   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
3345       BotCand.Policy != BotPolicy) {
3346     BotCand.reset(CandPolicy());
3347     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
3348     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
3349   } else {
3350     LLVM_DEBUG(traceCandidate(BotCand));
3351 #ifndef NDEBUG
3352     if (VerifyScheduling) {
3353       SchedCandidate TCand;
3354       TCand.reset(CandPolicy());
3355       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
3356       assert(TCand.SU == BotCand.SU &&
3357              "Last pick result should correspond to re-picking right now");
3358     }
3359 #endif
3360   }
3361 
3362   // Check if the top Q has a better candidate.
3363   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
3364   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
3365       TopCand.Policy != TopPolicy) {
3366     TopCand.reset(CandPolicy());
3367     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
3368     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
3369   } else {
3370     LLVM_DEBUG(traceCandidate(TopCand));
3371 #ifndef NDEBUG
3372     if (VerifyScheduling) {
3373       SchedCandidate TCand;
3374       TCand.reset(CandPolicy());
3375       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
3376       assert(TCand.SU == TopCand.SU &&
3377              "Last pick result should correspond to re-picking right now");
3378     }
3379 #endif
3380   }
3381 
3382   // Pick best from BotCand and TopCand.
3383   assert(BotCand.isValid());
3384   assert(TopCand.isValid());
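  // Prefer the bottom candidate by default; TopCand must win the
  // cross-boundary comparison (Zone == nullptr) below to override it.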
3385   SchedCandidate Cand = BotCand;
3386   TopCand.Reason = NoCand;
3387   if (tryCandidate(Cand, TopCand, nullptr)) {
3388     Cand.setBest(TopCand);
3389     LLVM_DEBUG(traceCandidate(Cand));
3390   }
3391 
3392   IsTopNode = Cand.AtTop;
3393   tracePick(Cand);
3394   return Cand.SU;
3395 }
3396 
3397 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
3398 SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
3399   if (DAG->top() == DAG->bottom()) {
3400     assert(Top.Available.empty() && Top.Pending.empty() &&
3401            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
3402     return nullptr;
3403   }
3404   SUnit *SU;
3405   do {
3406     if (RegionPolicy.OnlyTopDown) {
3407       SU = Top.pickOnlyChoice();
3408       if (!SU) {
3409         CandPolicy NoPolicy;
3410         TopCand.reset(NoPolicy);
3411         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
3412         assert(TopCand.Reason != NoCand && "failed to find a candidate");
3413         tracePick(TopCand);
3414         SU = TopCand.SU;
3415       }
3416       IsTopNode = true;
3417     } else if (RegionPolicy.OnlyBottomUp) {
3418       SU = Bot.pickOnlyChoice();
3419       if (!SU) {
3420         CandPolicy NoPolicy;
3421         BotCand.reset(NoPolicy);
3422         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
3423         assert(BotCand.Reason != NoCand && "failed to find a candidate");
3424         tracePick(BotCand);
3425         SU = BotCand.SU;
3426       }
3427       IsTopNode = false;
3428     } else {
3429       SU = pickNodeBidirectional(IsTopNode);
3430     }
3431   } while (SU->isScheduled);
3432 
3433   if (SU->isTopReady())
3434     Top.removeReady(SU);
3435   if (SU->isBottomReady())
3436     Bot.removeReady(SU);
3437 
3438   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
3439                     << *SU->getInstr());
3440   return SU;
3441 }
3442 
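/// Move copies (and move immediates) that are already scheduled and reach SU
/// through a single physreg data edge so they sit directly adjacent to SU,
/// keeping the physical register live range short; see biasPhysReg.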
3443 void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
3444   MachineBasicBlock::iterator InsertPos = SU->getInstr();
3445   if (!isTop)
3446     ++InsertPos;
3447   SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs;
3448 
3449   // Find already scheduled copies with a single physreg dependence and move
3450   // them just above the scheduled instruction.
3451   for (SDep &Dep : Deps) {
3452     if (Dep.getKind() != SDep::Data ||
3453         !Register::isPhysicalRegister(Dep.getReg()))
3454       continue;
3455     SUnit *DepSU = Dep.getSUnit();
3456     if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
3457       continue;
3458     MachineInstr *Copy = DepSU->getInstr();
3459     if (!Copy->isCopy() && !Copy->isMoveImmediate())
3460       continue;
3461     LLVM_DEBUG(dbgs() << "  Rescheduling physreg copy ";
3462                DAG->dumpNode(*Dep.getSUnit()));
3463     DAG->moveInstruction(Copy, InsertPos);
3464   }
3465 }
3466 
3467 /// Update the scheduler's state after scheduling a node. This is the same node
3468 /// that was just returned by pickNode(). However, ScheduleDAGMILive needs to
3469 /// update its state based on the current cycle before MachineSchedStrategy
3470 /// does.
3471 ///
3472 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
3473 /// them here. See comments in biasPhysReg.
3474 void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
3475   if (IsTopNode) {
3476     SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
3477     Top.bumpNode(SU);
3478     if (SU->hasPhysRegUses)
3479       reschedulePhysReg(SU, true);
3480   } else {
3481     SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
3482     Bot.bumpNode(SU);
3483     if (SU->hasPhysRegDefs)
3484       reschedulePhysReg(SU, false);
3485   }
3486 }
3487 
3488 /// Create the standard converging machine scheduler. This will be used as the
3489 /// default scheduler if the target does not set a default.
3490 ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
3491   ScheduleDAGMILive *DAG =
3492       new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
3493   // Register DAG post-processors.
3494   //
3495   // FIXME: extend the mutation API to allow earlier mutations to instantiate
3496   // data and pass it to later mutations. Have a single mutation that gathers
3497   // the interesting nodes in one pass.
3498   DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
3499   return DAG;
3500 }
3501 
3502 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
3503   return createGenericSchedLive(C);
3504 }
3505 
3506 static MachineSchedRegistry
3507 GenericSchedRegistry("converge", "Standard converging scheduler.",
3508                      createConvergingSched);
3509 
3510 //===----------------------------------------------------------------------===//
3511 // PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
3512 //===----------------------------------------------------------------------===//
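// PostGenericScheduler reuses the GenericSchedulerBase heuristics after
// register allocation: it schedules a single top-down zone, ignores register
// pressure entirely, and relies on hazard, resource, and latency information.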
3513 
3514 void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
3515   DAG = Dag;
3516   SchedModel = DAG->getSchedModel();
3517   TRI = DAG->TRI;
3518 
3519   Rem.init(DAG, SchedModel);
3520   Top.init(DAG, SchedModel, &Rem);
3521   BotRoots.clear();
3522 
3523   // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
3524   // or are disabled, then these HazardRecs will be disabled.
3525   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
3526   if (!Top.HazardRec) {
3527     Top.HazardRec =
3528         DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
3529             Itin, DAG);
3530   }
3531 }
3532 
3533 void PostGenericScheduler::registerRoots() {
3534   Rem.CriticalPath = DAG->ExitSU.getDepth();
3535 
3536   // Some roots may not feed into ExitSU. Check all of them in case.
3537   for (const SUnit *SU : BotRoots) {
3538     if (SU->getDepth() > Rem.CriticalPath)
3539       Rem.CriticalPath = SU->getDepth();
3540   }
3541   LLVM_DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
3542   if (DumpCriticalPathLength) {
3543     errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n";
3544   }
3545 }
3546 
3547 /// Apply a set of heuristics to a new candidate for PostRA scheduling.
3548 ///
3549 /// \param Cand provides the policy and current best candidate.
3550 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
3551 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
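///
/// Compared with the pre-RA GenericScheduler, this ordering drops all register
/// pressure heuristics: stall cycles, clustering, resource balance, latency,
/// and original instruction order are checked in that order.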
3552 bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
3553                                         SchedCandidate &TryCand) {
3554   // Initialize the candidate if needed.
3555   if (!Cand.isValid()) {
3556     TryCand.Reason = NodeOrder;
3557     return true;
3558   }
3559 
3560   // Prioritize instructions that read unbuffered resources by stall cycles.
3561   if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
3562               Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
3563     return TryCand.Reason != NoCand;
3564 
3565   // Keep clustered nodes together.
3566   if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
3567                  Cand.SU == DAG->getNextClusterSucc(),
3568                  TryCand, Cand, Cluster))
3569     return TryCand.Reason != NoCand;
3570 
3571   // Avoid critical resource consumption and balance the schedule.
3572   if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
3573               TryCand, Cand, ResourceReduce))
3574     return TryCand.Reason != NoCand;
3575   if (tryGreater(TryCand.ResDelta.DemandedResources,
3576                  Cand.ResDelta.DemandedResources,
3577                  TryCand, Cand, ResourceDemand))
3578     return TryCand.Reason != NoCand;
3579 
3580   // Avoid serializing long latency dependence chains.
3581   if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
3582     return TryCand.Reason != NoCand;
3583   }
3584 
3585   // Fall through to original instruction order.
3586   if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
3587     TryCand.Reason = NodeOrder;
3588     return true;
3589   }
3590 
3591   return false;
3592 }
3593 
3594 void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
3595   ReadyQueue &Q = Top.Available;
3596   for (SUnit *SU : Q) {
3597     SchedCandidate TryCand(Cand.Policy);
3598     TryCand.SU = SU;
3599     TryCand.AtTop = true;
3600     TryCand.initResourceDelta(DAG, SchedModel);
3601     if (tryCandidate(Cand, TryCand)) {
3602       Cand.setBest(TryCand);
3603       LLVM_DEBUG(traceCandidate(Cand));
3604     }
3605   }
3606 }
3607 
3608 /// Pick the next node to schedule.
3609 SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
3610   if (DAG->top() == DAG->bottom()) {
3611     assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage");
3612     return nullptr;
3613   }
3614   SUnit *SU;
3615   do {
3616     SU = Top.pickOnlyChoice();
3617     if (SU) {
3618       tracePick(Only1, true);
3619     } else {
3620       CandPolicy NoPolicy;
3621       SchedCandidate TopCand(NoPolicy);
3622       // Set the top-down policy based on the state of the current top zone and
3623       // the instructions outside the zone, including the bottom zone.
3624       setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
3625       pickNodeFromQueue(TopCand);
3626       assert(TopCand.Reason != NoCand && "failed to find a candidate");
3627       tracePick(TopCand);
3628       SU = TopCand.SU;
3629     }
3630   } while (SU->isScheduled);
3631 
3632   IsTopNode = true;
3633   Top.removeReady(SU);
3634 
3635   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
3636                     << *SU->getInstr());
3637   return SU;
3638 }
3639 
3640 /// Called after ScheduleDAGMI has scheduled an instruction and updated
3641 /// scheduled/remaining flags in the DAG nodes.
3642 void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
3643   SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
3644   Top.bumpNode(SU);
3645 }
3646 
3647 ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
3648   return new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(C),
3649                            /*RemoveKillFlags=*/true);
3650 }
3651 
3652 //===----------------------------------------------------------------------===//
3653 // ILP Scheduler. Currently for experimental analysis of heuristics.
3654 //===----------------------------------------------------------------------===//
3655 
3656 namespace {
3657 
3658 /// Order nodes by the ILP metric.
3659 struct ILPOrder {
3660   const SchedDFSResult *DFSResult = nullptr;
3661   const BitVector *ScheduledTrees = nullptr;
3662   bool MaximizeILP;
3663 
3664   ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}
3665 
3666   /// Apply a less-than relation on node priority.
3667   ///
3668   /// (Return true if A comes after B in the Q.)
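  ///
  /// Priority order: nodes in already-scheduled subtrees first, then nodes in
  /// subtrees with a deeper connection level, then the ILP metric itself
  /// (maximized or minimized depending on MaximizeILP).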
3669   bool operator()(const SUnit *A, const SUnit *B) const {
3670     unsigned SchedTreeA = DFSResult->getSubtreeID(A);
3671     unsigned SchedTreeB = DFSResult->getSubtreeID(B);
3672     if (SchedTreeA != SchedTreeB) {
3673       // Unscheduled trees have lower priority.
3674       if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
3675         return ScheduledTrees->test(SchedTreeB);
3676 
3677       // Trees with shallower connections have lower priority.
3678       if (DFSResult->getSubtreeLevel(SchedTreeA)
3679           != DFSResult->getSubtreeLevel(SchedTreeB)) {
3680         return DFSResult->getSubtreeLevel(SchedTreeA)
3681           < DFSResult->getSubtreeLevel(SchedTreeB);
3682       }
3683     }
3684     if (MaximizeILP)
3685       return DFSResult->getILP(A) < DFSResult->getILP(B);
3686     else
3687       return DFSResult->getILP(A) > DFSResult->getILP(B);
3688   }
3689 };
3690 
3691 /// Schedule based on the ILP metric.
3692 class ILPScheduler : public MachineSchedStrategy {
3693   ScheduleDAGMILive *DAG = nullptr;
3694   ILPOrder Cmp;
3695 
3696   std::vector<SUnit*> ReadyQ;
3697 
3698 public:
3699   ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {}
3700 
3701   void initialize(ScheduleDAGMI *dag) override {
3702     assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
3703     DAG = static_cast<ScheduleDAGMILive*>(dag);
3704     DAG->computeDFSResult();
3705     Cmp.DFSResult = DAG->getDFSResult();
3706     Cmp.ScheduledTrees = &DAG->getScheduledTrees();
3707     ReadyQ.clear();
3708   }
3709 
3710   void registerRoots() override {
3711     // Restore the heap in ReadyQ with the updated DFS results.
3712     std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3713   }
3714 
3715   /// Implement MachineSchedStrategy interface.
3716   /// -----------------------------------------
3717 
3718   /// Callback to select the highest priority node from the ready Q.
3719   SUnit *pickNode(bool &IsTopNode) override {
3720     if (ReadyQ.empty()) return nullptr;
3721     std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3722     SUnit *SU = ReadyQ.back();
3723     ReadyQ.pop_back();
3724     IsTopNode = false;
3725     LLVM_DEBUG(dbgs() << "Pick node "
3726                       << "SU(" << SU->NodeNum << ") "
3727                       << " ILP: " << DAG->getDFSResult()->getILP(SU)
3728                       << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU)
3729                       << " @"
3730                       << DAG->getDFSResult()->getSubtreeLevel(
3731                              DAG->getDFSResult()->getSubtreeID(SU))
3732                       << '\n'
3733                       << "Scheduling " << *SU->getInstr());
3734     return SU;
3735   }
3736 
3737   /// Scheduler callback to notify that a new subtree is scheduled.
3738   void scheduleTree(unsigned SubtreeID) override {
3739     std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3740   }
3741 
3742   /// Callback after a node is scheduled. Mark a newly scheduled tree, notify
3743   /// DFSResults, and resort the priority Q.
3744   void schedNode(SUnit *SU, bool IsTopNode) override {
3745     assert(!IsTopNode && "SchedDFSResult needs bottom-up");
3746   }
3747 
3748   void releaseTopNode(SUnit *) override { /*only called for top roots*/ }
3749 
3750   void releaseBottomNode(SUnit *SU) override {
3751     ReadyQ.push_back(SU);
3752     std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3753   }
3754 };
3755 
3756 } // end anonymous namespace
3757 
3758 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
3759   return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(true));
3760 }
3761 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
3762   return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(false));
3763 }
3764 
3765 static MachineSchedRegistry ILPMaxRegistry(
3766   "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
3767 static MachineSchedRegistry ILPMinRegistry(
3768   "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);
3769 
3770 //===----------------------------------------------------------------------===//
3771 // Machine Instruction Shuffler for Correctness Testing
3772 //===----------------------------------------------------------------------===//
3773 
3774 #ifndef NDEBUG
3775 namespace {
3776 
3777 /// Apply a less-than relation on the node order, which corresponds to the
3778 /// instruction order prior to scheduling. IsReverse implements greater-than.
3779 template<bool IsReverse>
3780 struct SUnitOrder {
3781   bool operator()(SUnit *A, SUnit *B) const {
3782     if (IsReverse)
3783       return A->NodeNum > B->NodeNum;
3784     else
3785       return A->NodeNum < B->NodeNum;
3786   }
3787 };
3788 
3789 /// Reorder instructions as much as possible.
3790 class InstructionShuffler : public MachineSchedStrategy {
3791   bool IsAlternating;
3792   bool IsTopDown;
3793 
3794   // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
3795   // gives nodes with a higher number higher priority causing the latest
3796   // instructions to be scheduled first.
3797   PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>
3798     TopQ;
3799 
3800   // When scheduling bottom-up, use greater-than as the queue priority.
3801   PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>
3802     BottomQ;
3803 
3804 public:
3805   InstructionShuffler(bool alternate, bool topdown)
3806     : IsAlternating(alternate), IsTopDown(topdown) {}
3807 
3808   void initialize(ScheduleDAGMI*) override {
3809     TopQ.clear();
3810     BottomQ.clear();
3811   }
3812 
3813   /// Implement MachineSchedStrategy interface.
3814   /// -----------------------------------------
3815 
3816   SUnit *pickNode(bool &IsTopNode) override {
3817     SUnit *SU;
3818     if (IsTopDown) {
3819       do {
3820         if (TopQ.empty()) return nullptr;
3821         SU = TopQ.top();
3822         TopQ.pop();
3823       } while (SU->isScheduled);
3824       IsTopNode = true;
3825     } else {
3826       do {
3827         if (BottomQ.empty()) return nullptr;
3828         SU = BottomQ.top();
3829         BottomQ.pop();
3830       } while (SU->isScheduled);
3831       IsTopNode = false;
3832     }
3833     if (IsAlternating)
3834       IsTopDown = !IsTopDown;
3835     return SU;
3836   }
3837 
3838   void schedNode(SUnit *SU, bool IsTopNode) override {}
3839 
3840   void releaseTopNode(SUnit *SU) override {
3841     TopQ.push(SU);
3842   }
3843   void releaseBottomNode(SUnit *SU) override {
3844     BottomQ.push(SU);
3845   }
3846 };
3847 
3848 } // end anonymous namespace
3849 
3850 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
3851   bool Alternate = !ForceTopDown && !ForceBottomUp;
3852   bool TopDown = !ForceBottomUp;
3853   assert((TopDown || !ForceTopDown) &&
3854          "-misched-topdown incompatible with -misched-bottomup");
3855   return new ScheduleDAGMILive(
3856       C, std::make_unique<InstructionShuffler>(Alternate, TopDown));
3857 }
3858 
3859 static MachineSchedRegistry ShufflerRegistry(
3860   "shuffle", "Shuffle machine instructions alternating directions",
3861   createInstructionShuffler);
3862 #endif // !NDEBUG
3863 
3864 //===----------------------------------------------------------------------===//
3865 // GraphWriter support for ScheduleDAGMILive.
3866 //===----------------------------------------------------------------------===//
3867 
3868 #ifndef NDEBUG
3869 namespace llvm {
3870 
3871 template<> struct GraphTraits<
3872   ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
3873 
3874 template<>
3875 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
3876   DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
3877 
3878   static std::string getGraphName(const ScheduleDAG *G) {
3879     return std::string(G->MF.getName());
3880   }
3881 
3882   static bool renderGraphFromBottomUp() {
3883     return true;
3884   }
3885 
3886   static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
3887     if (ViewMISchedCutoff == 0)
3888       return false;
3889     return (Node->Preds.size() > ViewMISchedCutoff
3890          || Node->Succs.size() > ViewMISchedCutoff);
3891   }
3892 
3893   /// If you want to override the dot attributes printed for a particular
3894   /// edge, override this method.
3895   static std::string getEdgeAttributes(const SUnit *Node,
3896                                        SUnitIterator EI,
3897                                        const ScheduleDAG *Graph) {
3898     if (EI.isArtificialDep())
3899       return "color=cyan,style=dashed";
3900     if (EI.isCtrlDep())
3901       return "color=blue,style=dashed";
3902     return "";
3903   }
3904 
3905   static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
3906     std::string Str;
3907     raw_string_ostream SS(Str);
3908     const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
3909     const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
3910       static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
3911     SS << "SU:" << SU->NodeNum;
3912     if (DFS)
3913       SS << " I:" << DFS->getNumInstrs(SU);
3914     return SS.str();
3915   }
3916 
3917   static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
3918     return G->getGraphNodeLabel(SU);
3919   }
3920 
3921   static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) {
3922     std::string Str("shape=Mrecord");
3923     const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
3924     const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
3925       static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
3926     if (DFS) {
3927       Str += ",style=filled,fillcolor=\"#";
3928       Str += DOT::getColorString(DFS->getSubtreeID(N));
3929       Str += '"';
3930     }
3931     return Str;
3932   }
3933 };
3934 
3935 } // end namespace llvm
3936 #endif // NDEBUG
3937 
3938 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
3939 /// rendered using 'dot'.
3940 void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) {
3941 #ifndef NDEBUG
3942   ViewGraph(this, Name, false, Title);
3943 #else
3944   errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on "
3945          << "systems with Graphviz or gv!\n";
3946 #endif  // NDEBUG
3947 }
3948 
3949 /// Out-of-line implementation with no arguments is handy for gdb.
3950 void ScheduleDAGMI::viewGraph() {
3951   viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
3952 }
3953