xref: /freebsd-src/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp (revision 335c7cda12138f2aefa41fb739707612cc12a9be)
1 //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10 //
11 // This SMS implementation is a target-independent back-end pass. When enabled,
12 // the pass runs just prior to the register allocation pass, while the machine
13 // IR is in SSA form. If software pipelining is successful, then the original
14 // loop is replaced by the optimized loop. The optimized loop contains one or
15 // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16 // the instructions cannot be scheduled in a given MII, we increase the MII by
17 // one and try again.
18 //
19 // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20 // represent loop carried dependences in the DAG as order edges to the Phi
21 // nodes. We also perform several passes over the DAG to eliminate unnecessary
22 // edges that inhibit the ability to pipeline. The implementation uses the
23 // DFAPacketizer class to compute the minimum initiation interval and the check
24 // where an instruction may be inserted in the pipelined schedule.
25 //
26 // In order for the SMS pass to work, several target specific hooks need to be
27 // implemented to get information about the loop structure and to rewrite
28 // instructions.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #include "llvm/CodeGen/MachinePipeliner.h"
33 #include "llvm/ADT/ArrayRef.h"
34 #include "llvm/ADT/BitVector.h"
35 #include "llvm/ADT/DenseMap.h"
36 #include "llvm/ADT/MapVector.h"
37 #include "llvm/ADT/PriorityQueue.h"
38 #include "llvm/ADT/STLExtras.h"
39 #include "llvm/ADT/SetOperations.h"
40 #include "llvm/ADT/SetVector.h"
41 #include "llvm/ADT/SmallPtrSet.h"
42 #include "llvm/ADT/SmallSet.h"
43 #include "llvm/ADT/SmallVector.h"
44 #include "llvm/ADT/Statistic.h"
45 #include "llvm/ADT/iterator_range.h"
46 #include "llvm/Analysis/AliasAnalysis.h"
47 #include "llvm/Analysis/CycleAnalysis.h"
48 #include "llvm/Analysis/MemoryLocation.h"
49 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
50 #include "llvm/Analysis/ValueTracking.h"
51 #include "llvm/CodeGen/DFAPacketizer.h"
52 #include "llvm/CodeGen/LiveIntervals.h"
53 #include "llvm/CodeGen/MachineBasicBlock.h"
54 #include "llvm/CodeGen/MachineDominators.h"
55 #include "llvm/CodeGen/MachineFunction.h"
56 #include "llvm/CodeGen/MachineFunctionPass.h"
57 #include "llvm/CodeGen/MachineInstr.h"
58 #include "llvm/CodeGen/MachineInstrBuilder.h"
59 #include "llvm/CodeGen/MachineLoopInfo.h"
60 #include "llvm/CodeGen/MachineMemOperand.h"
61 #include "llvm/CodeGen/MachineOperand.h"
62 #include "llvm/CodeGen/MachineRegisterInfo.h"
63 #include "llvm/CodeGen/ModuloSchedule.h"
64 #include "llvm/CodeGen/Register.h"
65 #include "llvm/CodeGen/RegisterClassInfo.h"
66 #include "llvm/CodeGen/RegisterPressure.h"
67 #include "llvm/CodeGen/ScheduleDAG.h"
68 #include "llvm/CodeGen/ScheduleDAGMutation.h"
69 #include "llvm/CodeGen/TargetInstrInfo.h"
70 #include "llvm/CodeGen/TargetOpcodes.h"
71 #include "llvm/CodeGen/TargetRegisterInfo.h"
72 #include "llvm/CodeGen/TargetSubtargetInfo.h"
73 #include "llvm/Config/llvm-config.h"
74 #include "llvm/IR/Attributes.h"
75 #include "llvm/IR/Function.h"
76 #include "llvm/MC/LaneBitmask.h"
77 #include "llvm/MC/MCInstrDesc.h"
78 #include "llvm/MC/MCInstrItineraries.h"
79 #include "llvm/MC/MCRegisterInfo.h"
80 #include "llvm/Pass.h"
81 #include "llvm/Support/CommandLine.h"
82 #include "llvm/Support/Compiler.h"
83 #include "llvm/Support/Debug.h"
84 #include "llvm/Support/MathExtras.h"
85 #include "llvm/Support/raw_ostream.h"
86 #include <algorithm>
87 #include <cassert>
88 #include <climits>
89 #include <cstdint>
90 #include <deque>
91 #include <functional>
92 #include <iomanip>
93 #include <iterator>
94 #include <map>
95 #include <memory>
96 #include <sstream>
97 #include <tuple>
98 #include <utility>
99 #include <vector>
100 
101 using namespace llvm;
102 
103 #define DEBUG_TYPE "pipeliner"
104 
105 STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
106 STATISTIC(NumPipelined, "Number of loops software pipelined");
107 STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
108 STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
109 STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
110 STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
111 STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
112 STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
113 STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
114 STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
115 STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
116 
117 /// A command line option to turn software pipelining on or off.
118 static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
119                                cl::desc("Enable Software Pipelining"));
120 
121 /// A command line option to enable SWP at -Os.
122 static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
123                                       cl::desc("Enable SWP at Os."), cl::Hidden,
124                                       cl::init(false));
125 
126 /// A command line argument to limit minimum initial interval for pipelining.
127 static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
128                               cl::desc("Size limit for the MII."),
129                               cl::Hidden, cl::init(27));
130 
131 /// A command line argument to force pipeliner to use specified initial
132 /// interval.
133 static cl::opt<int> SwpForceII("pipeliner-force-ii",
134                                cl::desc("Force pipeliner to use specified II."),
135                                cl::Hidden, cl::init(-1));
136 
137 /// A command line argument to limit the number of stages in the pipeline.
138 static cl::opt<int>
139     SwpMaxStages("pipeliner-max-stages",
140                  cl::desc("Maximum stages allowed in the generated scheduled."),
141                  cl::Hidden, cl::init(3));
142 
143 /// A command line option to disable the pruning of chain dependences due to
144 /// an unrelated Phi.
145 static cl::opt<bool>
146     SwpPruneDeps("pipeliner-prune-deps",
147                  cl::desc("Prune dependences between unrelated Phi nodes."),
148                  cl::Hidden, cl::init(true));
149 
150 /// A command line option to disable the pruning of loop carried order
151 /// dependences.
152 static cl::opt<bool>
153     SwpPruneLoopCarried("pipeliner-prune-loop-carried",
154                         cl::desc("Prune loop carried order dependences."),
155                         cl::Hidden, cl::init(true));
156 
157 #ifndef NDEBUG
158 static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
159 #endif
160 
161 static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
162                                      cl::ReallyHidden,
163                                      cl::desc("Ignore RecMII"));
164 
165 static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
166                                     cl::init(false));
167 static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
168                                       cl::init(false));
169 
170 static cl::opt<bool> EmitTestAnnotations(
171     "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
172     cl::desc("Instead of emitting the pipelined code, annotate instructions "
173              "with the generated schedule for feeding into the "
174              "-modulo-schedule-test pass"));
175 
176 static cl::opt<bool> ExperimentalCodeGen(
177     "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
178     cl::desc(
179         "Use the experimental peeling code generator for software pipelining"));
180 
181 static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
182                                      cl::desc("Range to search for II"),
183                                      cl::Hidden, cl::init(10));
184 
185 static cl::opt<bool>
186     LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
187                      cl::desc("Limit register pressure of scheduled loop"));
188 
189 static cl::opt<int>
190     RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
191                       cl::init(5),
192                       cl::desc("Margin representing the unused percentage of "
193                                "the register pressure limit"));
194 
195 namespace llvm {
196 
197 // A command line option to enable the CopyToPhi DAG mutation.
198 cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
199                                  cl::init(true),
200                                  cl::desc("Enable CopyToPhi DAG Mutation"));
201 
202 /// A command line argument to force pipeliner to use specified issue
203 /// width.
204 cl::opt<int> SwpForceIssueWidth(
205     "pipeliner-force-issue-width",
206     cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
207     cl::init(-1));
208 
209 } // end namespace llvm
210 
211 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
212 char MachinePipeliner::ID = 0;
213 #ifndef NDEBUG
214 int MachinePipeliner::NumTries = 0;
215 #endif
216 char &llvm::MachinePipelinerID = MachinePipeliner::ID;
217 
218 INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
219                       "Modulo Software Pipelining", false, false)
220 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
221 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
222 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
223 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
224 INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
225                     "Modulo Software Pipelining", false, false)
226 
227 /// The "main" function for implementing Swing Modulo Scheduling.
228 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
229   if (skipFunction(mf.getFunction()))
230     return false;
231 
232   if (!EnableSWP)
233     return false;
234 
235   if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
236       !EnableSWPOptSize.getPosition())
237     return false;
238 
239   if (!mf.getSubtarget().enableMachinePipeliner())
240     return false;
241 
242   // Cannot pipeline loops without instruction itineraries if we are using
243   // DFA for the pipeliner.
244   if (mf.getSubtarget().useDFAforSMS() &&
245       (!mf.getSubtarget().getInstrItineraryData() ||
246        mf.getSubtarget().getInstrItineraryData()->isEmpty()))
247     return false;
248 
249   MF = &mf;
250   MLI = &getAnalysis<MachineLoopInfo>();
251   MDT = &getAnalysis<MachineDominatorTree>();
252   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
253   TII = MF->getSubtarget().getInstrInfo();
254   RegClassInfo.runOnMachineFunction(*MF);
255 
256   for (const auto &L : *MLI)
257     scheduleLoop(*L);
258 
259   return false;
260 }
261 
262 /// Attempt to perform the SMS algorithm on the specified loop. This function is
263 /// the main entry point for the algorithm.  The function identifies candidate
264 /// loops, calculates the minimum initiation interval, and attempts to schedule
265 /// the loop.
266 bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
267   bool Changed = false;
268   for (const auto &InnerLoop : L)
269     Changed |= scheduleLoop(*InnerLoop);
270 
271 #ifndef NDEBUG
272   // Stop trying after reaching the limit (if any).
273   int Limit = SwpLoopLimit;
274   if (Limit >= 0) {
275     if (NumTries >= SwpLoopLimit)
276       return Changed;
277     NumTries++;
278   }
279 #endif
280 
281   setPragmaPipelineOptions(L);
282   if (!canPipelineLoop(L)) {
283     LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
284     ORE->emit([&]() {
285       return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
286                                              L.getStartLoc(), L.getHeader())
287              << "Failed to pipeline loop";
288     });
289 
290     LI.LoopPipelinerInfo.reset();
291     return Changed;
292   }
293 
294   ++NumTrytoPipeline;
295 
296   Changed = swingModuloScheduler(L);
297 
298   LI.LoopPipelinerInfo.reset();
299   return Changed;
300 }
301 
302 void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
303   // Reset the pragma for the next loop in iteration.
304   disabledByPragma = false;
305   II_setByPragma = 0;
306 
307   MachineBasicBlock *LBLK = L.getTopBlock();
308 
309   if (LBLK == nullptr)
310     return;
311 
312   const BasicBlock *BBLK = LBLK->getBasicBlock();
313   if (BBLK == nullptr)
314     return;
315 
316   const Instruction *TI = BBLK->getTerminator();
317   if (TI == nullptr)
318     return;
319 
320   MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
321   if (LoopID == nullptr)
322     return;
323 
324   assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
325   assert(LoopID->getOperand(0) == LoopID && "invalid loop");
326 
327   for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
328     MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
329 
330     if (MD == nullptr)
331       continue;
332 
333     MDString *S = dyn_cast<MDString>(MD->getOperand(0));
334 
335     if (S == nullptr)
336       continue;
337 
338     if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
339       assert(MD->getNumOperands() == 2 &&
340              "Pipeline initiation interval hint metadata should have two operands.");
341       II_setByPragma =
342           mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
343       assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
344     } else if (S->getString() == "llvm.loop.pipeline.disable") {
345       disabledByPragma = true;
346     }
347   }
348 }
349 
350 /// Return true if the loop can be software pipelined.  The algorithm is
351 /// restricted to loops with a single basic block.  Make sure that the
352 /// branch in the loop can be analyzed.
353 bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
354   if (L.getNumBlocks() != 1) {
355     ORE->emit([&]() {
356       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
357                                                L.getStartLoc(), L.getHeader())
358              << "Not a single basic block: "
359              << ore::NV("NumBlocks", L.getNumBlocks());
360     });
361     return false;
362   }
363 
364   if (disabledByPragma) {
365     ORE->emit([&]() {
366       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
367                                                L.getStartLoc(), L.getHeader())
368              << "Disabled by Pragma.";
369     });
370     return false;
371   }
372 
373   // Check if the branch can't be understood because we can't do pipelining
374   // if that's the case.
375   LI.TBB = nullptr;
376   LI.FBB = nullptr;
377   LI.BrCond.clear();
378   if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
379     LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
380     NumFailBranch++;
381     ORE->emit([&]() {
382       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
383                                                L.getStartLoc(), L.getHeader())
384              << "The branch can't be understood";
385     });
386     return false;
387   }
388 
389   LI.LoopInductionVar = nullptr;
390   LI.LoopCompare = nullptr;
391   LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
392   if (!LI.LoopPipelinerInfo) {
393     LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
394     NumFailLoop++;
395     ORE->emit([&]() {
396       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
397                                                L.getStartLoc(), L.getHeader())
398              << "The loop structure is not supported";
399     });
400     return false;
401   }
402 
403   if (!L.getLoopPreheader()) {
404     LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
405     NumFailPreheader++;
406     ORE->emit([&]() {
407       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
408                                                L.getStartLoc(), L.getHeader())
409              << "No loop preheader found";
410     });
411     return false;
412   }
413 
414   // Remove any subregisters from inputs to phi nodes.
415   preprocessPhiNodes(*L.getHeader());
416   return true;
417 }
418 
419 void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
420   MachineRegisterInfo &MRI = MF->getRegInfo();
421   SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
422 
423   for (MachineInstr &PI : B.phis()) {
424     MachineOperand &DefOp = PI.getOperand(0);
425     assert(DefOp.getSubReg() == 0);
426     auto *RC = MRI.getRegClass(DefOp.getReg());
427 
428     for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
429       MachineOperand &RegOp = PI.getOperand(i);
430       if (RegOp.getSubReg() == 0)
431         continue;
432 
433       // If the operand uses a subregister, replace it with a new register
434       // without subregisters, and generate a copy to the new register.
435       Register NewReg = MRI.createVirtualRegister(RC);
436       MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
437       MachineBasicBlock::iterator At = PredB.getFirstTerminator();
438       const DebugLoc &DL = PredB.findDebugLoc(At);
439       auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
440                     .addReg(RegOp.getReg(), getRegState(RegOp),
441                             RegOp.getSubReg());
442       Slots.insertMachineInstrInMaps(*Copy);
443       RegOp.setReg(NewReg);
444       RegOp.setSubReg(0);
445     }
446   }
447 }
448 
449 /// The SMS algorithm consists of the following main steps:
450 /// 1. Computation and analysis of the dependence graph.
451 /// 2. Ordering of the nodes (instructions).
452 /// 3. Attempt to Schedule the loop.
453 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
454   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
455 
456   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
457                         II_setByPragma, LI.LoopPipelinerInfo.get());
458 
459   MachineBasicBlock *MBB = L.getHeader();
460   // The kernel should not include any terminator instructions.  These
461   // will be added back later.
462   SMS.startBlock(MBB);
463 
464   // Compute the number of 'real' instructions in the basic block by
465   // ignoring terminators.
466   unsigned size = MBB->size();
467   for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
468                                    E = MBB->instr_end();
469        I != E; ++I, --size)
470     ;
471 
472   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
473   SMS.schedule();
474   SMS.exitRegion();
475 
476   SMS.finishBlock();
477   return SMS.hasNewSchedule();
478 }
479 
480 void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
481   AU.addRequired<AAResultsWrapperPass>();
482   AU.addPreserved<AAResultsWrapperPass>();
483   AU.addRequired<MachineLoopInfo>();
484   AU.addRequired<MachineDominatorTree>();
485   AU.addRequired<LiveIntervals>();
486   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
487   MachineFunctionPass::getAnalysisUsage(AU);
488 }
489 
490 void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
491   if (SwpForceII > 0)
492     MII = SwpForceII;
493   else if (II_setByPragma > 0)
494     MII = II_setByPragma;
495   else
496     MII = std::max(ResMII, RecMII);
497 }
498 
499 void SwingSchedulerDAG::setMAX_II() {
500   if (SwpForceII > 0)
501     MAX_II = SwpForceII;
502   else if (II_setByPragma > 0)
503     MAX_II = II_setByPragma;
504   else
505     MAX_II = MII + SwpIISearchRange;
506 }
507 
508 /// We override the schedule function in ScheduleDAGInstrs to implement the
509 /// scheduling part of the Swing Modulo Scheduling algorithm.
510 void SwingSchedulerDAG::schedule() {
511   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
512   buildSchedGraph(AA);
513   addLoopCarriedDependences(AA);
514   updatePhiDependences();
515   Topo.InitDAGTopologicalSorting();
516   changeDependences();
517   postProcessDAG();
518   LLVM_DEBUG(dump());
519 
520   NodeSetType NodeSets;
521   findCircuits(NodeSets);
522   NodeSetType Circuits = NodeSets;
523 
524   // Calculate the MII.
525   unsigned ResMII = calculateResMII();
526   unsigned RecMII = calculateRecMII(NodeSets);
527 
528   fuseRecs(NodeSets);
529 
530   // This flag is used for testing and can cause correctness problems.
531   if (SwpIgnoreRecMII)
532     RecMII = 0;
533 
534   setMII(ResMII, RecMII);
535   setMAX_II();
536 
537   LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
538                     << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
539 
540   // Can't schedule a loop without a valid MII.
541   if (MII == 0) {
542     LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
543     NumFailZeroMII++;
544     Pass.ORE->emit([&]() {
545       return MachineOptimizationRemarkAnalysis(
546                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
547              << "Invalid Minimal Initiation Interval: 0";
548     });
549     return;
550   }
551 
552   // Don't pipeline large loops.
553   if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
554     LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
555                       << ", we don't pipeline large loops\n");
556     NumFailLargeMaxMII++;
557     Pass.ORE->emit([&]() {
558       return MachineOptimizationRemarkAnalysis(
559                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
560              << "Minimal Initiation Interval too large: "
561              << ore::NV("MII", (int)MII) << " > "
562              << ore::NV("SwpMaxMii", SwpMaxMii) << "."
563              << "Refer to -pipeliner-max-mii.";
564     });
565     return;
566   }
567 
568   computeNodeFunctions(NodeSets);
569 
570   registerPressureFilter(NodeSets);
571 
572   colocateNodeSets(NodeSets);
573 
574   checkNodeSets(NodeSets);
575 
576   LLVM_DEBUG({
577     for (auto &I : NodeSets) {
578       dbgs() << "  Rec NodeSet ";
579       I.dump();
580     }
581   });
582 
583   llvm::stable_sort(NodeSets, std::greater<NodeSet>());
584 
585   groupRemainingNodes(NodeSets);
586 
587   removeDuplicateNodes(NodeSets);
588 
589   LLVM_DEBUG({
590     for (auto &I : NodeSets) {
591       dbgs() << "  NodeSet ";
592       I.dump();
593     }
594   });
595 
596   computeNodeOrder(NodeSets);
597 
598   // check for node order issues
599   checkValidNodeOrder(Circuits);
600 
601   SMSchedule Schedule(Pass.MF, this);
602   Scheduled = schedulePipeline(Schedule);
603 
604   if (!Scheduled){
605     LLVM_DEBUG(dbgs() << "No schedule found, return\n");
606     NumFailNoSchedule++;
607     Pass.ORE->emit([&]() {
608       return MachineOptimizationRemarkAnalysis(
609                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
610              << "Unable to find schedule";
611     });
612     return;
613   }
614 
615   unsigned numStages = Schedule.getMaxStageCount();
616   // No need to generate pipeline if there are no overlapped iterations.
617   if (numStages == 0) {
618     LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
619     NumFailZeroStage++;
620     Pass.ORE->emit([&]() {
621       return MachineOptimizationRemarkAnalysis(
622                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
623              << "No need to pipeline - no overlapped iterations in schedule.";
624     });
625     return;
626   }
627   // Check that the maximum stage count is less than user-defined limit.
628   if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
629     LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
630                       << " : too many stages, abort\n");
631     NumFailLargeMaxStage++;
632     Pass.ORE->emit([&]() {
633       return MachineOptimizationRemarkAnalysis(
634                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
635              << "Too many stages in schedule: "
636              << ore::NV("numStages", (int)numStages) << " > "
637              << ore::NV("SwpMaxStages", SwpMaxStages)
638              << ". Refer to -pipeliner-max-stages.";
639     });
640     return;
641   }
642 
643   Pass.ORE->emit([&]() {
644     return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
645                                      Loop.getHeader())
646            << "Pipelined succesfully!";
647   });
648 
649   // Generate the schedule as a ModuloSchedule.
650   DenseMap<MachineInstr *, int> Cycles, Stages;
651   std::vector<MachineInstr *> OrderedInsts;
652   for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
653        ++Cycle) {
654     for (SUnit *SU : Schedule.getInstructions(Cycle)) {
655       OrderedInsts.push_back(SU->getInstr());
656       Cycles[SU->getInstr()] = Cycle;
657       Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
658     }
659   }
660   DenseMap<MachineInstr *, std::pair<unsigned, int64_t>> NewInstrChanges;
661   for (auto &KV : NewMIs) {
662     Cycles[KV.first] = Cycles[KV.second];
663     Stages[KV.first] = Stages[KV.second];
664     NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
665   }
666 
667   ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
668                     std::move(Stages));
669   if (EmitTestAnnotations) {
670     assert(NewInstrChanges.empty() &&
671            "Cannot serialize a schedule with InstrChanges!");
672     ModuloScheduleTestAnnotater MSTI(MF, MS);
673     MSTI.annotate();
674     return;
675   }
676   // The experimental code generator can't work if there are InstChanges.
677   if (ExperimentalCodeGen && NewInstrChanges.empty()) {
678     PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
679     MSE.expand();
680   } else {
681     ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
682     MSE.expand();
683     MSE.cleanup();
684   }
685   ++NumPipelined;
686 }
687 
688 /// Clean up after the software pipeliner runs.
689 void SwingSchedulerDAG::finishBlock() {
690   for (auto &KV : NewMIs)
691     MF.deleteMachineInstr(KV.second);
692   NewMIs.clear();
693 
694   // Call the superclass.
695   ScheduleDAGInstrs::finishBlock();
696 }
697 
698 /// Return the register values for  the operands of a Phi instruction.
699 /// This function assume the instruction is a Phi.
700 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
701                        unsigned &InitVal, unsigned &LoopVal) {
702   assert(Phi.isPHI() && "Expecting a Phi.");
703 
704   InitVal = 0;
705   LoopVal = 0;
706   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
707     if (Phi.getOperand(i + 1).getMBB() != Loop)
708       InitVal = Phi.getOperand(i).getReg();
709     else
710       LoopVal = Phi.getOperand(i).getReg();
711 
712   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
713 }
714 
715 /// Return the Phi register value that comes the loop block.
716 static unsigned getLoopPhiReg(const MachineInstr &Phi,
717                               const MachineBasicBlock *LoopBB) {
718   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
719     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
720       return Phi.getOperand(i).getReg();
721   return 0;
722 }
723 
724 /// Return true if SUb can be reached from SUa following the chain edges.
725 static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
726   SmallPtrSet<SUnit *, 8> Visited;
727   SmallVector<SUnit *, 8> Worklist;
728   Worklist.push_back(SUa);
729   while (!Worklist.empty()) {
730     const SUnit *SU = Worklist.pop_back_val();
731     for (const auto &SI : SU->Succs) {
732       SUnit *SuccSU = SI.getSUnit();
733       if (SI.getKind() == SDep::Order) {
734         if (Visited.count(SuccSU))
735           continue;
736         if (SuccSU == SUb)
737           return true;
738         Worklist.push_back(SuccSU);
739         Visited.insert(SuccSU);
740       }
741     }
742   }
743   return false;
744 }
745 
746 /// Return true if the instruction causes a chain between memory
747 /// references before and after it.
748 static bool isDependenceBarrier(MachineInstr &MI) {
749   return MI.isCall() || MI.mayRaiseFPException() ||
750          MI.hasUnmodeledSideEffects() ||
751          (MI.hasOrderedMemoryRef() &&
752           (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
753 }
754 
755 /// Return the underlying objects for the memory references of an instruction.
756 /// This function calls the code in ValueTracking, but first checks that the
757 /// instruction has a memory operand.
758 static void getUnderlyingObjects(const MachineInstr *MI,
759                                  SmallVectorImpl<const Value *> &Objs) {
760   if (!MI->hasOneMemOperand())
761     return;
762   MachineMemOperand *MM = *MI->memoperands_begin();
763   if (!MM->getValue())
764     return;
765   getUnderlyingObjects(MM->getValue(), Objs);
766   for (const Value *V : Objs) {
767     if (!isIdentifiedObject(V)) {
768       Objs.clear();
769       return;
770     }
771     Objs.push_back(V);
772   }
773 }
774 
775 /// Add a chain edge between a load and store if the store can be an
776 /// alias of the load on a subsequent iteration, i.e., a loop carried
777 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
778 /// but that code doesn't create loop carried dependences.
779 void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
780   MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
781   Value *UnknownValue =
782     UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
783   for (auto &SU : SUnits) {
784     MachineInstr &MI = *SU.getInstr();
785     if (isDependenceBarrier(MI))
786       PendingLoads.clear();
787     else if (MI.mayLoad()) {
788       SmallVector<const Value *, 4> Objs;
789       ::getUnderlyingObjects(&MI, Objs);
790       if (Objs.empty())
791         Objs.push_back(UnknownValue);
792       for (const auto *V : Objs) {
793         SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
794         SUs.push_back(&SU);
795       }
796     } else if (MI.mayStore()) {
797       SmallVector<const Value *, 4> Objs;
798       ::getUnderlyingObjects(&MI, Objs);
799       if (Objs.empty())
800         Objs.push_back(UnknownValue);
801       for (const auto *V : Objs) {
802         MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
803             PendingLoads.find(V);
804         if (I == PendingLoads.end())
805           continue;
806         for (auto *Load : I->second) {
807           if (isSuccOrder(Load, &SU))
808             continue;
809           MachineInstr &LdMI = *Load->getInstr();
810           // First, perform the cheaper check that compares the base register.
811           // If they are the same and the load offset is less than the store
812           // offset, then mark the dependence as loop carried potentially.
813           const MachineOperand *BaseOp1, *BaseOp2;
814           int64_t Offset1, Offset2;
815           bool Offset1IsScalable, Offset2IsScalable;
816           if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
817                                            Offset1IsScalable, TRI) &&
818               TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
819                                            Offset2IsScalable, TRI)) {
820             if (BaseOp1->isIdenticalTo(*BaseOp2) &&
821                 Offset1IsScalable == Offset2IsScalable &&
822                 (int)Offset1 < (int)Offset2) {
823               assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
824                      "What happened to the chain edge?");
825               SDep Dep(Load, SDep::Barrier);
826               Dep.setLatency(1);
827               SU.addPred(Dep);
828               continue;
829             }
830           }
831           // Second, the more expensive check that uses alias analysis on the
832           // base registers. If they alias, and the load offset is less than
833           // the store offset, the mark the dependence as loop carried.
834           if (!AA) {
835             SDep Dep(Load, SDep::Barrier);
836             Dep.setLatency(1);
837             SU.addPred(Dep);
838             continue;
839           }
840           MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
841           MachineMemOperand *MMO2 = *MI.memoperands_begin();
842           if (!MMO1->getValue() || !MMO2->getValue()) {
843             SDep Dep(Load, SDep::Barrier);
844             Dep.setLatency(1);
845             SU.addPred(Dep);
846             continue;
847           }
848           if (MMO1->getValue() == MMO2->getValue() &&
849               MMO1->getOffset() <= MMO2->getOffset()) {
850             SDep Dep(Load, SDep::Barrier);
851             Dep.setLatency(1);
852             SU.addPred(Dep);
853             continue;
854           }
855           if (!AA->isNoAlias(
856                   MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()),
857                   MemoryLocation::getAfter(MMO2->getValue(),
858                                            MMO2->getAAInfo()))) {
859             SDep Dep(Load, SDep::Barrier);
860             Dep.setLatency(1);
861             SU.addPred(Dep);
862           }
863         }
864       }
865     }
866   }
867 }
868 
869 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
870 /// processes dependences for PHIs. This function adds true dependences
871 /// from a PHI to a use, and a loop carried dependence from the use to the
872 /// PHI. The loop carried dependence is represented as an anti dependence
873 /// edge. This function also removes chain dependences between unrelated
874 /// PHIs.
875 void SwingSchedulerDAG::updatePhiDependences() {
876   SmallVector<SDep, 4> RemoveDeps;
877   const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
878 
879   // Iterate over each DAG node.
880   for (SUnit &I : SUnits) {
881     RemoveDeps.clear();
882     // Set to true if the instruction has an operand defined by a Phi.
883     unsigned HasPhiUse = 0;
884     unsigned HasPhiDef = 0;
885     MachineInstr *MI = I.getInstr();
886     // Iterate over each operand, and we process the definitions.
887     for (const MachineOperand &MO : MI->operands()) {
888       if (!MO.isReg())
889         continue;
890       Register Reg = MO.getReg();
891       if (MO.isDef()) {
892         // If the register is used by a Phi, then create an anti dependence.
893         for (MachineRegisterInfo::use_instr_iterator
894                  UI = MRI.use_instr_begin(Reg),
895                  UE = MRI.use_instr_end();
896              UI != UE; ++UI) {
897           MachineInstr *UseMI = &*UI;
898           SUnit *SU = getSUnit(UseMI);
899           if (SU != nullptr && UseMI->isPHI()) {
900             if (!MI->isPHI()) {
901               SDep Dep(SU, SDep::Anti, Reg);
902               Dep.setLatency(1);
903               I.addPred(Dep);
904             } else {
905               HasPhiDef = Reg;
906               // Add a chain edge to a dependent Phi that isn't an existing
907               // predecessor.
908               if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
909                 I.addPred(SDep(SU, SDep::Barrier));
910             }
911           }
912         }
913       } else if (MO.isUse()) {
914         // If the register is defined by a Phi, then create a true dependence.
915         MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
916         if (DefMI == nullptr)
917           continue;
918         SUnit *SU = getSUnit(DefMI);
919         if (SU != nullptr && DefMI->isPHI()) {
920           if (!MI->isPHI()) {
921             SDep Dep(SU, SDep::Data, Reg);
922             Dep.setLatency(0);
923             ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep);
924             I.addPred(Dep);
925           } else {
926             HasPhiUse = Reg;
927             // Add a chain edge to a dependent Phi that isn't an existing
928             // predecessor.
929             if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
930               I.addPred(SDep(SU, SDep::Barrier));
931           }
932         }
933       }
934     }
935     // Remove order dependences from an unrelated Phi.
936     if (!SwpPruneDeps)
937       continue;
938     for (auto &PI : I.Preds) {
939       MachineInstr *PMI = PI.getSUnit()->getInstr();
940       if (PMI->isPHI() && PI.getKind() == SDep::Order) {
941         if (I.getInstr()->isPHI()) {
942           if (PMI->getOperand(0).getReg() == HasPhiUse)
943             continue;
944           if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
945             continue;
946         }
947         RemoveDeps.push_back(PI);
948       }
949     }
950     for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
951       I.removePred(RemoveDeps[i]);
952   }
953 }
954 
955 /// Iterate over each DAG node and see if we can change any dependences
956 /// in order to reduce the recurrence MII.
957 void SwingSchedulerDAG::changeDependences() {
958   // See if an instruction can use a value from the previous iteration.
959   // If so, we update the base and offset of the instruction and change
960   // the dependences.
961   for (SUnit &I : SUnits) {
962     unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
963     int64_t NewOffset = 0;
964     if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
965                                NewOffset))
966       continue;
967 
968     // Get the MI and SUnit for the instruction that defines the original base.
969     Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
970     MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
971     if (!DefMI)
972       continue;
973     SUnit *DefSU = getSUnit(DefMI);
974     if (!DefSU)
975       continue;
976     // Get the MI and SUnit for the instruction that defins the new base.
977     MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
978     if (!LastMI)
979       continue;
980     SUnit *LastSU = getSUnit(LastMI);
981     if (!LastSU)
982       continue;
983 
984     if (Topo.IsReachable(&I, LastSU))
985       continue;
986 
987     // Remove the dependence. The value now depends on a prior iteration.
988     SmallVector<SDep, 4> Deps;
989     for (const SDep &P : I.Preds)
990       if (P.getSUnit() == DefSU)
991         Deps.push_back(P);
992     for (int i = 0, e = Deps.size(); i != e; i++) {
993       Topo.RemovePred(&I, Deps[i].getSUnit());
994       I.removePred(Deps[i]);
995     }
996     // Remove the chain dependence between the instructions.
997     Deps.clear();
998     for (auto &P : LastSU->Preds)
999       if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1000         Deps.push_back(P);
1001     for (int i = 0, e = Deps.size(); i != e; i++) {
1002       Topo.RemovePred(LastSU, Deps[i].getSUnit());
1003       LastSU->removePred(Deps[i]);
1004     }
1005 
1006     // Add a dependence between the new instruction and the instruction
1007     // that defines the new base.
1008     SDep Dep(&I, SDep::Anti, NewBase);
1009     Topo.AddPred(LastSU, &I);
1010     LastSU->addPred(Dep);
1011 
1012     // Remember the base and offset information so that we can update the
1013     // instruction during code generation.
1014     InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1015   }
1016 }
1017 
1018 /// Create an instruction stream that represents a single iteration and stage of
1019 /// each instruction. This function differs from SMSchedule::finalizeSchedule in
1020 /// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1021 /// function is an approximation of SMSchedule::finalizeSchedule with all
1022 /// non-const operations removed.
1023 static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
1024                                   SMSchedule &Schedule,
1025                                   std::vector<MachineInstr *> &OrderedInsts,
1026                                   DenseMap<MachineInstr *, unsigned> &Stages) {
1027   DenseMap<int, std::deque<SUnit *>> Instrs;
1028 
1029   // Move all instructions to the first stage from the later stages.
1030   for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1031        ++Cycle) {
1032     for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1033          Stage <= LastStage; ++Stage) {
1034       for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1035                Cycle + Stage * Schedule.getInitiationInterval()))) {
1036         Instrs[Cycle].push_front(SU);
1037       }
1038     }
1039   }
1040 
1041   for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1042        ++Cycle) {
1043     std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1044     CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1045     for (SUnit *SU : CycleInstrs) {
1046       MachineInstr *MI = SU->getInstr();
1047       OrderedInsts.push_back(MI);
1048       Stages[MI] = Schedule.stageScheduled(SU);
1049     }
1050   }
1051 }
1052 
1053 namespace {
1054 
1055 // FuncUnitSorter - Comparison operator used to sort instructions by
1056 // the number of functional unit choices.
1057 struct FuncUnitSorter {
1058   const InstrItineraryData *InstrItins;
1059   const MCSubtargetInfo *STI;
1060   DenseMap<InstrStage::FuncUnits, unsigned> Resources;
1061 
1062   FuncUnitSorter(const TargetSubtargetInfo &TSI)
1063       : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1064 
1065   // Compute the number of functional unit alternatives needed
1066   // at each stage, and take the minimum value. We prioritize the
1067   // instructions by the least number of choices first.
1068   unsigned minFuncUnits(const MachineInstr *Inst,
1069                         InstrStage::FuncUnits &F) const {
1070     unsigned SchedClass = Inst->getDesc().getSchedClass();
1071     unsigned min = UINT_MAX;
1072     if (InstrItins && !InstrItins->isEmpty()) {
1073       for (const InstrStage &IS :
1074            make_range(InstrItins->beginStage(SchedClass),
1075                       InstrItins->endStage(SchedClass))) {
1076         InstrStage::FuncUnits funcUnits = IS.getUnits();
1077         unsigned numAlternatives = llvm::popcount(funcUnits);
1078         if (numAlternatives < min) {
1079           min = numAlternatives;
1080           F = funcUnits;
1081         }
1082       }
1083       return min;
1084     }
1085     if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1086       const MCSchedClassDesc *SCDesc =
1087           STI->getSchedModel().getSchedClassDesc(SchedClass);
1088       if (!SCDesc->isValid())
1089         // No valid Schedule Class Desc for schedClass, should be
1090         // Pseudo/PostRAPseudo
1091         return min;
1092 
1093       for (const MCWriteProcResEntry &PRE :
1094            make_range(STI->getWriteProcResBegin(SCDesc),
1095                       STI->getWriteProcResEnd(SCDesc))) {
1096         if (!PRE.ReleaseAtCycle)
1097           continue;
1098         const MCProcResourceDesc *ProcResource =
1099             STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1100         unsigned NumUnits = ProcResource->NumUnits;
1101         if (NumUnits < min) {
1102           min = NumUnits;
1103           F = PRE.ProcResourceIdx;
1104         }
1105       }
1106       return min;
1107     }
1108     llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1109   }
1110 
1111   // Compute the critical resources needed by the instruction. This
1112   // function records the functional units needed by instructions that
1113   // must use only one functional unit. We use this as a tie breaker
1114   // for computing the resource MII. The instrutions that require
1115   // the same, highly used, functional unit have high priority.
1116   void calcCriticalResources(MachineInstr &MI) {
1117     unsigned SchedClass = MI.getDesc().getSchedClass();
1118     if (InstrItins && !InstrItins->isEmpty()) {
1119       for (const InstrStage &IS :
1120            make_range(InstrItins->beginStage(SchedClass),
1121                       InstrItins->endStage(SchedClass))) {
1122         InstrStage::FuncUnits FuncUnits = IS.getUnits();
1123         if (llvm::popcount(FuncUnits) == 1)
1124           Resources[FuncUnits]++;
1125       }
1126       return;
1127     }
1128     if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1129       const MCSchedClassDesc *SCDesc =
1130           STI->getSchedModel().getSchedClassDesc(SchedClass);
1131       if (!SCDesc->isValid())
1132         // No valid Schedule Class Desc for schedClass, should be
1133         // Pseudo/PostRAPseudo
1134         return;
1135 
1136       for (const MCWriteProcResEntry &PRE :
1137            make_range(STI->getWriteProcResBegin(SCDesc),
1138                       STI->getWriteProcResEnd(SCDesc))) {
1139         if (!PRE.ReleaseAtCycle)
1140           continue;
1141         Resources[PRE.ProcResourceIdx]++;
1142       }
1143       return;
1144     }
1145     llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1146   }
1147 
1148   /// Return true if IS1 has less priority than IS2.
1149   bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1150     InstrStage::FuncUnits F1 = 0, F2 = 0;
1151     unsigned MFUs1 = minFuncUnits(IS1, F1);
1152     unsigned MFUs2 = minFuncUnits(IS2, F2);
1153     if (MFUs1 == MFUs2)
1154       return Resources.lookup(F1) < Resources.lookup(F2);
1155     return MFUs1 > MFUs2;
1156   }
1157 };
1158 
1159 /// Calculate the maximum register pressure of the scheduled instructions stream
1160 class HighRegisterPressureDetector {
1161   MachineBasicBlock *OrigMBB;
1162   const MachineFunction &MF;
1163   const MachineRegisterInfo &MRI;
1164   const TargetRegisterInfo *TRI;
1165 
1166   const unsigned PSetNum;
1167 
1168   // Indexed by PSet ID
1169   // InitSetPressure takes into account the register pressure of live-in
1170   // registers. It's not depend on how the loop is scheduled, so it's enough to
1171   // calculate them once at the beginning.
1172   std::vector<unsigned> InitSetPressure;
1173 
1174   // Indexed by PSet ID
1175   // Upper limit for each register pressure set
1176   std::vector<unsigned> PressureSetLimit;
1177 
1178   DenseMap<MachineInstr *, RegisterOperands> ROMap;
1179 
1180   using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
1181 
1182 public:
1183   using OrderedInstsTy = std::vector<MachineInstr *>;
1184   using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1185 
1186 private:
1187   static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1188     if (Pressures.size() == 0) {
1189       dbgs() << "[]";
1190     } else {
1191       char Prefix = '[';
1192       for (unsigned P : Pressures) {
1193         dbgs() << Prefix << P;
1194         Prefix = ' ';
1195       }
1196       dbgs() << ']';
1197     }
1198   }
1199 
1200   void dumpPSet(Register Reg) const {
1201     dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1202     for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
1203          ++PSetIter) {
1204       dbgs() << *PSetIter << ' ';
1205     }
1206     dbgs() << '\n';
1207   }
1208 
1209   void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1210                                 Register Reg) const {
1211     auto PSetIter = MRI.getPressureSets(Reg);
1212     unsigned Weight = PSetIter.getWeight();
1213     for (; PSetIter.isValid(); ++PSetIter)
1214       Pressure[*PSetIter] += Weight;
1215   }
1216 
1217   void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1218                                 Register Reg) const {
1219     auto PSetIter = MRI.getPressureSets(Reg);
1220     unsigned Weight = PSetIter.getWeight();
1221     for (; PSetIter.isValid(); ++PSetIter) {
1222       auto &P = Pressure[*PSetIter];
1223       assert(P >= Weight &&
1224              "register pressure must be greater than or equal weight");
1225       P -= Weight;
1226     }
1227   }
1228 
1229   // Return true if Reg is fixed one, for example, stack pointer
1230   bool isFixedRegister(Register Reg) const {
1231     return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
1232   }
1233 
1234   bool isDefinedInThisLoop(Register Reg) const {
1235     return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1236   }
1237 
1238   // Search for live-in variables. They are factored into the register pressure
1239   // from the begining. Live-in variables used by every iteration should be
1240   // considered as alive throughout the loop. For example, the variable `c` in
1241   // following code. \code
1242   //   int c = ...;
1243   //   for (int i = 0; i < n; i++)
1244   //     a[i] += b[i] + c;
1245   // \endcode
1246   void computeLiveIn() {
1247     DenseSet<Register> Used;
1248     for (auto &MI : *OrigMBB) {
1249       if (MI.isDebugInstr())
1250         continue;
1251       for (auto Use : ROMap[&MI].Uses) {
1252         auto Reg = Use.RegUnit;
1253         // Ignore the variable that appears only on one side of phi instruction
1254         // because it's used only at the first iteration.
1255         if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1256           continue;
1257         if (isFixedRegister(Reg))
1258           continue;
1259         if (isDefinedInThisLoop(Reg))
1260           continue;
1261         Used.insert(Reg);
1262       }
1263     }
1264 
1265     for (auto LiveIn : Used)
1266       increaseRegisterPressure(InitSetPressure, LiveIn);
1267   }
1268 
1269   // Calculate the upper limit of each pressure set
1270   void computePressureSetLimit(const RegisterClassInfo &RCI) {
1271     for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1272       PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1273 
1274     // We assume fixed registers, such as stack pointer, are already in use.
1275     // Therefore subtracting the weight of the fixed registers from the limit of
1276     // each pressure set in advance.
1277     SmallDenseSet<Register, 8> FixedRegs;
1278     for (const TargetRegisterClass *TRC : TRI->regclasses()) {
1279       for (const MCPhysReg Reg : *TRC)
1280         if (isFixedRegister(Reg))
1281           FixedRegs.insert(Reg);
1282     }
1283 
1284     LLVM_DEBUG({
1285       for (auto Reg : FixedRegs) {
1286         dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
1287         const int *Sets = TRI->getRegUnitPressureSets(Reg);
1288         for (; *Sets != -1; Sets++) {
1289           dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
1290         }
1291         dbgs() << "]\n";
1292       }
1293     });
1294 
1295     for (auto Reg : FixedRegs) {
1296       LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
1297                         << "\n");
1298       auto PSetIter = MRI.getPressureSets(Reg);
1299       unsigned Weight = PSetIter.getWeight();
1300       for (; PSetIter.isValid(); ++PSetIter) {
1301         unsigned &Limit = PressureSetLimit[*PSetIter];
1302         assert(Limit >= Weight &&
1303                "register pressure limit must be greater than or equal weight");
1304         Limit -= Weight;
1305         LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
1306                           << " (decreased by " << Weight << ")\n");
1307       }
1308     }
1309   }
1310 
1311   // There are two patterns of last-use.
1312   //   - by an instruction of the current iteration
1313   //   - by a phi instruction of the next iteration (loop carried value)
1314   //
1315   // Furthermore, following two groups of instructions are executed
1316   // simultaneously
1317   //   - next iteration's phi instructions in i-th stage
1318   //   - current iteration's instructions in i+1-th stage
1319   //
1320   // This function calculates the last-use of each register while taking into
1321   // account the above two patterns.
1322   Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1323                                    Instr2StageTy &Stages) const {
1324     // We treat virtual registers that are defined and used in this loop.
1325     // Following virtual register will be ignored
1326     //   - live-in one
1327     //   - defined but not used in the loop (potentially live-out)
1328     DenseSet<Register> TargetRegs;
1329     const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1330       if (isDefinedInThisLoop(Reg))
1331         TargetRegs.insert(Reg);
1332     };
1333     for (MachineInstr *MI : OrderedInsts) {
1334       if (MI->isPHI()) {
1335         Register Reg = getLoopPhiReg(*MI, OrigMBB);
1336         UpdateTargetRegs(Reg);
1337       } else {
1338         for (auto Use : ROMap.find(MI)->getSecond().Uses)
1339           UpdateTargetRegs(Use.RegUnit);
1340       }
1341     }
1342 
1343     const auto InstrScore = [&Stages](MachineInstr *MI) {
1344       return Stages[MI] + MI->isPHI();
1345     };
1346 
1347     DenseMap<Register, MachineInstr *> LastUseMI;
1348     for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1349       for (auto Use : ROMap.find(MI)->getSecond().Uses) {
1350         auto Reg = Use.RegUnit;
1351         if (!TargetRegs.contains(Reg))
1352           continue;
1353         auto Ite = LastUseMI.find(Reg);
1354         if (Ite == LastUseMI.end()) {
1355           LastUseMI[Reg] = MI;
1356         } else {
1357           MachineInstr *Orig = Ite->second;
1358           MachineInstr *New = MI;
1359           if (InstrScore(Orig) < InstrScore(New))
1360             LastUseMI[Reg] = New;
1361         }
1362       }
1363     }
1364 
1365     Instr2LastUsesTy LastUses;
1366     for (auto &Entry : LastUseMI)
1367       LastUses[Entry.second].insert(Entry.first);
1368     return LastUses;
1369   }
1370 
1371   // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1372   // iterations and check the register pressure at the point where all stages
1373   // overlapping.
1374   //
1375   // An example of unrolled loop where #Stage is 4..
1376   // Iter   i+0 i+1 i+2 i+3
1377   // ------------------------
1378   // Stage   0
1379   // Stage   1   0
1380   // Stage   2   1   0
1381   // Stage   3   2   1   0  <- All stages overlap
1382   //
1383   std::vector<unsigned>
1384   computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1385                         Instr2StageTy &Stages,
1386                         const unsigned StageCount) const {
1387     using RegSetTy = SmallDenseSet<Register, 16>;
1388 
1389     // Indexed by #Iter. To treat "local" variables of each stage separately, we
1390     // manage the liveness of the registers independently by iterations.
1391     SmallVector<RegSetTy> LiveRegSets(StageCount);
1392 
1393     auto CurSetPressure = InitSetPressure;
1394     auto MaxSetPressure = InitSetPressure;
1395     auto LastUses = computeLastUses(OrderedInsts, Stages);
1396 
1397     LLVM_DEBUG({
1398       dbgs() << "Ordered instructions:\n";
1399       for (MachineInstr *MI : OrderedInsts) {
1400         dbgs() << "Stage " << Stages[MI] << ": ";
1401         MI->dump();
1402       }
1403     });
1404 
1405     const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1406                                                    Register Reg) {
1407       if (!Reg.isValid() || isFixedRegister(Reg))
1408         return;
1409 
1410       bool Inserted = RegSet.insert(Reg).second;
1411       if (!Inserted)
1412         return;
1413 
1414       LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1415       increaseRegisterPressure(CurSetPressure, Reg);
1416       LLVM_DEBUG(dumpPSet(Reg));
1417     };
1418 
1419     const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1420                                                   Register Reg) {
1421       if (!Reg.isValid() || isFixedRegister(Reg))
1422         return;
1423 
1424       // live-in register
1425       if (!RegSet.contains(Reg))
1426         return;
1427 
1428       LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1429       RegSet.erase(Reg);
1430       decreaseRegisterPressure(CurSetPressure, Reg);
1431       LLVM_DEBUG(dumpPSet(Reg));
1432     };
1433 
1434     for (unsigned I = 0; I < StageCount; I++) {
1435       for (MachineInstr *MI : OrderedInsts) {
1436         const auto Stage = Stages[MI];
1437         if (I < Stage)
1438           continue;
1439 
1440         const unsigned Iter = I - Stage;
1441 
1442         for (auto Def : ROMap.find(MI)->getSecond().Defs)
1443           InsertReg(LiveRegSets[Iter], Def.RegUnit);
1444 
1445         for (auto LastUse : LastUses[MI]) {
1446           if (MI->isPHI()) {
1447             if (Iter != 0)
1448               EraseReg(LiveRegSets[Iter - 1], LastUse);
1449           } else {
1450             EraseReg(LiveRegSets[Iter], LastUse);
1451           }
1452         }
1453 
1454         for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1455           MaxSetPressure[PSet] =
1456               std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1457 
1458         LLVM_DEBUG({
1459           dbgs() << "CurSetPressure=";
1460           dumpRegisterPressures(CurSetPressure);
1461           dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1462           MI->dump();
1463         });
1464       }
1465     }
1466 
1467     return MaxSetPressure;
1468   }
1469 
1470 public:
1471   HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1472                                const MachineFunction &MF)
1473       : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
1474         TRI(MF.getSubtarget().getRegisterInfo()),
1475         PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1476         PressureSetLimit(PSetNum, 0) {}
1477 
1478   // Used to calculate register pressure, which is independent of loop
1479   // scheduling.
1480   void init(const RegisterClassInfo &RCI) {
1481     for (MachineInstr &MI : *OrigMBB) {
1482       if (MI.isDebugInstr())
1483         continue;
1484       ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1485     }
1486 
1487     computeLiveIn();
1488     computePressureSetLimit(RCI);
1489   }
1490 
1491   // Calculate the maximum register pressures of the loop and check if they
1492   // exceed the limit
1493   bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1494               const unsigned MaxStage) const {
1495     assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
1496            "the percentage of the margin must be between 0 to 100");
1497 
1498     OrderedInstsTy OrderedInsts;
1499     Instr2StageTy Stages;
1500     computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1501     const auto MaxSetPressure =
1502         computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1503 
1504     LLVM_DEBUG({
1505       dbgs() << "Dump MaxSetPressure:\n";
1506       for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1507         dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1508       }
1509       dbgs() << '\n';
1510     });
1511 
1512     for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1513       unsigned Limit = PressureSetLimit[PSet];
1514       unsigned Margin = Limit * RegPressureMargin / 100;
1515       LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1516                         << " Margin=" << Margin << "\n");
1517       if (Limit < MaxSetPressure[PSet] + Margin) {
1518         LLVM_DEBUG(
1519             dbgs()
1520             << "Rejected the schedule because of too high register pressure\n");
1521         return true;
1522       }
1523     }
1524     return false;
1525   }
1526 };
1527 
1528 } // end anonymous namespace
1529 
1530 /// Calculate the resource constrained minimum initiation interval for the
1531 /// specified loop. We use the DFA to model the resources needed for
1532 /// each instruction, and we ignore dependences. A different DFA is created
1533 /// for each cycle that is required. When adding a new instruction, we attempt
1534 /// to add it to each existing DFA, until a legal space is found. If the
1535 /// instruction cannot be reserved in an existing DFA, we create a new one.
1536 unsigned SwingSchedulerDAG::calculateResMII() {
1537   LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1538   ResourceManager RM(&MF.getSubtarget(), this);
1539   return RM.calculateResMII();
1540 }
1541 
1542 /// Calculate the recurrence-constrainted minimum initiation interval.
1543 /// Iterate over each circuit.  Compute the delay(c) and distance(c)
1544 /// for each circuit. The II needs to satisfy the inequality
1545 /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1546 /// II that satisfies the inequality, and the RecMII is the maximum
1547 /// of those values.
1548 unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1549   unsigned RecMII = 0;
1550 
1551   for (NodeSet &Nodes : NodeSets) {
1552     if (Nodes.empty())
1553       continue;
1554 
1555     unsigned Delay = Nodes.getLatency();
1556     unsigned Distance = 1;
1557 
1558     // ii = ceil(delay / distance)
1559     unsigned CurMII = (Delay + Distance - 1) / Distance;
1560     Nodes.setRecMII(CurMII);
1561     if (CurMII > RecMII)
1562       RecMII = CurMII;
1563   }
1564 
1565   return RecMII;
1566 }
1567 
1568 /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1569 /// but we do this to find the circuits, and then change them back.
1570 static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1571   SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1572   for (SUnit &SU : SUnits) {
1573     for (SDep &Pred : SU.Preds)
1574       if (Pred.getKind() == SDep::Anti)
1575         DepsAdded.push_back(std::make_pair(&SU, Pred));
1576   }
1577   for (std::pair<SUnit *, SDep> &P : DepsAdded) {
1578     // Remove this anti dependency and add one in the reverse direction.
1579     SUnit *SU = P.first;
1580     SDep &D = P.second;
1581     SUnit *TargetSU = D.getSUnit();
1582     unsigned Reg = D.getReg();
1583     unsigned Lat = D.getLatency();
1584     SU->removePred(D);
1585     SDep Dep(SU, SDep::Anti, Reg);
1586     Dep.setLatency(Lat);
1587     TargetSU->addPred(Dep);
1588   }
1589 }
1590 
1591 /// Create the adjacency structure of the nodes in the graph.
1592 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1593     SwingSchedulerDAG *DAG) {
1594   BitVector Added(SUnits.size());
1595   DenseMap<int, int> OutputDeps;
1596   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1597     Added.reset();
1598     // Add any successor to the adjacency matrix and exclude duplicates.
1599     for (auto &SI : SUnits[i].Succs) {
1600       // Only create a back-edge on the first and last nodes of a dependence
1601       // chain. This records any chains and adds them later.
1602       if (SI.getKind() == SDep::Output) {
1603         int N = SI.getSUnit()->NodeNum;
1604         int BackEdge = i;
1605         auto Dep = OutputDeps.find(BackEdge);
1606         if (Dep != OutputDeps.end()) {
1607           BackEdge = Dep->second;
1608           OutputDeps.erase(Dep);
1609         }
1610         OutputDeps[N] = BackEdge;
1611       }
1612       // Do not process a boundary node, an artificial node.
1613       // A back-edge is processed only if it goes to a Phi.
1614       if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1615           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1616         continue;
1617       int N = SI.getSUnit()->NodeNum;
1618       if (!Added.test(N)) {
1619         AdjK[i].push_back(N);
1620         Added.set(N);
1621       }
1622     }
1623     // A chain edge between a store and a load is treated as a back-edge in the
1624     // adjacency matrix.
1625     for (auto &PI : SUnits[i].Preds) {
1626       if (!SUnits[i].getInstr()->mayStore() ||
1627           !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1628         continue;
1629       if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1630         int N = PI.getSUnit()->NodeNum;
1631         if (!Added.test(N)) {
1632           AdjK[i].push_back(N);
1633           Added.set(N);
1634         }
1635       }
1636     }
1637   }
1638   // Add back-edges in the adjacency matrix for the output dependences.
1639   for (auto &OD : OutputDeps)
1640     if (!Added.test(OD.second)) {
1641       AdjK[OD.first].push_back(OD.second);
1642       Added.set(OD.second);
1643     }
1644 }
1645 
1646 /// Identify an elementary circuit in the dependence graph starting at the
1647 /// specified node.
1648 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1649                                           bool HasBackedge) {
1650   SUnit *SV = &SUnits[V];
1651   bool F = false;
1652   Stack.insert(SV);
1653   Blocked.set(V);
1654 
1655   for (auto W : AdjK[V]) {
1656     if (NumPaths > MaxPaths)
1657       break;
1658     if (W < S)
1659       continue;
1660     if (W == S) {
1661       if (!HasBackedge)
1662         NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1663       F = true;
1664       ++NumPaths;
1665       break;
1666     } else if (!Blocked.test(W)) {
1667       if (circuit(W, S, NodeSets,
1668                   Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1669         F = true;
1670     }
1671   }
1672 
1673   if (F)
1674     unblock(V);
1675   else {
1676     for (auto W : AdjK[V]) {
1677       if (W < S)
1678         continue;
1679       B[W].insert(SV);
1680     }
1681   }
1682   Stack.pop_back();
1683   return F;
1684 }
1685 
1686 /// Unblock a node in the circuit finding algorithm.
1687 void SwingSchedulerDAG::Circuits::unblock(int U) {
1688   Blocked.reset(U);
1689   SmallPtrSet<SUnit *, 4> &BU = B[U];
1690   while (!BU.empty()) {
1691     SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1692     assert(SI != BU.end() && "Invalid B set.");
1693     SUnit *W = *SI;
1694     BU.erase(W);
1695     if (Blocked.test(W->NodeNum))
1696       unblock(W->NodeNum);
1697   }
1698 }
1699 
1700 /// Identify all the elementary circuits in the dependence graph using
1701 /// Johnson's circuit algorithm.
1702 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1703   // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1704   // but we do this to find the circuits, and then change them back.
1705   swapAntiDependences(SUnits);
1706 
1707   Circuits Cir(SUnits, Topo);
1708   // Create the adjacency structure.
1709   Cir.createAdjacencyStructure(this);
1710   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1711     Cir.reset();
1712     Cir.circuit(i, i, NodeSets);
1713   }
1714 
1715   // Change the dependences back so that we've created a DAG again.
1716   swapAntiDependences(SUnits);
1717 }
1718 
1719 // Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1720 // is loop-carried to the USE in next iteration. This will help pipeliner avoid
1721 // additional copies that are needed across iterations. An artificial dependence
1722 // edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1723 
1724 // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1725 // SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1726 // PHI-------True-Dep------> USEOfPhi
1727 
1728 // The mutation creates
1729 // USEOfPHI -------Artificial-Dep---> SRCOfCopy
1730 
1731 // This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1732 // (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1733 // late  to avoid additional copies across iterations. The possible scheduling
1734 // order would be
1735 // USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
1736 
1737 void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1738   for (SUnit &SU : DAG->SUnits) {
1739     // Find the COPY/REG_SEQUENCE instruction.
1740     if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1741       continue;
1742 
1743     // Record the loop carried PHIs.
1744     SmallVector<SUnit *, 4> PHISUs;
1745     // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1746     SmallVector<SUnit *, 4> SrcSUs;
1747 
1748     for (auto &Dep : SU.Preds) {
1749       SUnit *TmpSU = Dep.getSUnit();
1750       MachineInstr *TmpMI = TmpSU->getInstr();
1751       SDep::Kind DepKind = Dep.getKind();
1752       // Save the loop carried PHI.
1753       if (DepKind == SDep::Anti && TmpMI->isPHI())
1754         PHISUs.push_back(TmpSU);
1755       // Save the source of COPY/REG_SEQUENCE.
1756       // If the source has no pre-decessors, we will end up creating cycles.
1757       else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1758         SrcSUs.push_back(TmpSU);
1759     }
1760 
1761     if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1762       continue;
1763 
1764     // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1765     // SUnit to the container.
1766     SmallVector<SUnit *, 8> UseSUs;
1767     // Do not use iterator based loop here as we are updating the container.
1768     for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1769       for (auto &Dep : PHISUs[Index]->Succs) {
1770         if (Dep.getKind() != SDep::Data)
1771           continue;
1772 
1773         SUnit *TmpSU = Dep.getSUnit();
1774         MachineInstr *TmpMI = TmpSU->getInstr();
1775         if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1776           PHISUs.push_back(TmpSU);
1777           continue;
1778         }
1779         UseSUs.push_back(TmpSU);
1780       }
1781     }
1782 
1783     if (UseSUs.size() == 0)
1784       continue;
1785 
1786     SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1787     // Add the artificial dependencies if it does not form a cycle.
1788     for (auto *I : UseSUs) {
1789       for (auto *Src : SrcSUs) {
1790         if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1791           Src->addPred(SDep(I, SDep::Artificial));
1792           SDAG->Topo.AddPred(Src, I);
1793         }
1794       }
1795     }
1796   }
1797 }
1798 
1799 /// Return true for DAG nodes that we ignore when computing the cost functions.
1800 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
1801 /// in the calculation of the ASAP, ALAP, etc functions.
1802 static bool ignoreDependence(const SDep &D, bool isPred) {
1803   if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1804     return true;
1805   return D.getKind() == SDep::Anti && isPred;
1806 }
1807 
1808 /// Compute several functions need to order the nodes for scheduling.
1809 ///  ASAP - Earliest time to schedule a node.
1810 ///  ALAP - Latest time to schedule a node.
1811 ///  MOV - Mobility function, difference between ALAP and ASAP.
1812 ///  D - Depth of each node.
1813 ///  H - Height of each node.
1814 void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1815   ScheduleInfo.resize(SUnits.size());
1816 
1817   LLVM_DEBUG({
1818     for (int I : Topo) {
1819       const SUnit &SU = SUnits[I];
1820       dumpNode(SU);
1821     }
1822   });
1823 
1824   int maxASAP = 0;
1825   // Compute ASAP and ZeroLatencyDepth.
1826   for (int I : Topo) {
1827     int asap = 0;
1828     int zeroLatencyDepth = 0;
1829     SUnit *SU = &SUnits[I];
1830     for (const SDep &P : SU->Preds) {
1831       SUnit *pred = P.getSUnit();
1832       if (P.getLatency() == 0)
1833         zeroLatencyDepth =
1834             std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1835       if (ignoreDependence(P, true))
1836         continue;
1837       asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
1838                                   getDistance(pred, SU, P) * MII));
1839     }
1840     maxASAP = std::max(maxASAP, asap);
1841     ScheduleInfo[I].ASAP = asap;
1842     ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
1843   }
1844 
1845   // Compute ALAP, ZeroLatencyHeight, and MOV.
1846   for (int I : llvm::reverse(Topo)) {
1847     int alap = maxASAP;
1848     int zeroLatencyHeight = 0;
1849     SUnit *SU = &SUnits[I];
1850     for (const SDep &S : SU->Succs) {
1851       SUnit *succ = S.getSUnit();
1852       if (succ->isBoundaryNode())
1853         continue;
1854       if (S.getLatency() == 0)
1855         zeroLatencyHeight =
1856             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1857       if (ignoreDependence(S, true))
1858         continue;
1859       alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
1860                                   getDistance(SU, succ, S) * MII));
1861     }
1862 
1863     ScheduleInfo[I].ALAP = alap;
1864     ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
1865   }
1866 
1867   // After computing the node functions, compute the summary for each node set.
1868   for (NodeSet &I : NodeSets)
1869     I.computeNodeSetInfo(this);
1870 
1871   LLVM_DEBUG({
1872     for (unsigned i = 0; i < SUnits.size(); i++) {
1873       dbgs() << "\tNode " << i << ":\n";
1874       dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
1875       dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
1876       dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
1877       dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
1878       dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
1879       dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1880       dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1881     }
1882   });
1883 }
1884 
1885 /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1886 /// as the predecessors of the elements of NodeOrder that are not also in
1887 /// NodeOrder.
1888 static bool pred_L(SetVector<SUnit *> &NodeOrder,
1889                    SmallSetVector<SUnit *, 8> &Preds,
1890                    const NodeSet *S = nullptr) {
1891   Preds.clear();
1892   for (const SUnit *SU : NodeOrder) {
1893     for (const SDep &Pred : SU->Preds) {
1894       if (S && S->count(Pred.getSUnit()) == 0)
1895         continue;
1896       if (ignoreDependence(Pred, true))
1897         continue;
1898       if (NodeOrder.count(Pred.getSUnit()) == 0)
1899         Preds.insert(Pred.getSUnit());
1900     }
1901     // Back-edges are predecessors with an anti-dependence.
1902     for (const SDep &Succ : SU->Succs) {
1903       if (Succ.getKind() != SDep::Anti)
1904         continue;
1905       if (S && S->count(Succ.getSUnit()) == 0)
1906         continue;
1907       if (NodeOrder.count(Succ.getSUnit()) == 0)
1908         Preds.insert(Succ.getSUnit());
1909     }
1910   }
1911   return !Preds.empty();
1912 }
1913 
1914 /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1915 /// as the successors of the elements of NodeOrder that are not also in
1916 /// NodeOrder.
1917 static bool succ_L(SetVector<SUnit *> &NodeOrder,
1918                    SmallSetVector<SUnit *, 8> &Succs,
1919                    const NodeSet *S = nullptr) {
1920   Succs.clear();
1921   for (const SUnit *SU : NodeOrder) {
1922     for (const SDep &Succ : SU->Succs) {
1923       if (S && S->count(Succ.getSUnit()) == 0)
1924         continue;
1925       if (ignoreDependence(Succ, false))
1926         continue;
1927       if (NodeOrder.count(Succ.getSUnit()) == 0)
1928         Succs.insert(Succ.getSUnit());
1929     }
1930     for (const SDep &Pred : SU->Preds) {
1931       if (Pred.getKind() != SDep::Anti)
1932         continue;
1933       if (S && S->count(Pred.getSUnit()) == 0)
1934         continue;
1935       if (NodeOrder.count(Pred.getSUnit()) == 0)
1936         Succs.insert(Pred.getSUnit());
1937     }
1938   }
1939   return !Succs.empty();
1940 }
1941 
1942 /// Return true if there is a path from the specified node to any of the nodes
1943 /// in DestNodes. Keep track and return the nodes in any path.
1944 static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1945                         SetVector<SUnit *> &DestNodes,
1946                         SetVector<SUnit *> &Exclude,
1947                         SmallPtrSet<SUnit *, 8> &Visited) {
1948   if (Cur->isBoundaryNode())
1949     return false;
1950   if (Exclude.contains(Cur))
1951     return false;
1952   if (DestNodes.contains(Cur))
1953     return true;
1954   if (!Visited.insert(Cur).second)
1955     return Path.contains(Cur);
1956   bool FoundPath = false;
1957   for (auto &SI : Cur->Succs)
1958     if (!ignoreDependence(SI, false))
1959       FoundPath |=
1960           computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1961   for (auto &PI : Cur->Preds)
1962     if (PI.getKind() == SDep::Anti)
1963       FoundPath |=
1964           computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1965   if (FoundPath)
1966     Path.insert(Cur);
1967   return FoundPath;
1968 }
1969 
1970 /// Compute the live-out registers for the instructions in a node-set.
1971 /// The live-out registers are those that are defined in the node-set,
1972 /// but not used. Except for use operands of Phis.
1973 static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1974                             NodeSet &NS) {
1975   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1976   MachineRegisterInfo &MRI = MF.getRegInfo();
1977   SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1978   SmallSet<unsigned, 4> Uses;
1979   for (SUnit *SU : NS) {
1980     const MachineInstr *MI = SU->getInstr();
1981     if (MI->isPHI())
1982       continue;
1983     for (const MachineOperand &MO : MI->all_uses()) {
1984       Register Reg = MO.getReg();
1985       if (Reg.isVirtual())
1986         Uses.insert(Reg);
1987       else if (MRI.isAllocatable(Reg))
1988         for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
1989           Uses.insert(Unit);
1990     }
1991   }
1992   for (SUnit *SU : NS)
1993     for (const MachineOperand &MO : SU->getInstr()->all_defs())
1994       if (!MO.isDead()) {
1995         Register Reg = MO.getReg();
1996         if (Reg.isVirtual()) {
1997           if (!Uses.count(Reg))
1998             LiveOutRegs.push_back(RegisterMaskPair(Reg,
1999                                                    LaneBitmask::getNone()));
2000         } else if (MRI.isAllocatable(Reg)) {
2001           for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2002             if (!Uses.count(Unit))
2003               LiveOutRegs.push_back(
2004                   RegisterMaskPair(Unit, LaneBitmask::getNone()));
2005         }
2006       }
2007   RPTracker.addLiveRegs(LiveOutRegs);
2008 }
2009 
2010 /// A heuristic to filter nodes in recurrent node-sets if the register
2011 /// pressure of a set is too high.
2012 void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2013   for (auto &NS : NodeSets) {
2014     // Skip small node-sets since they won't cause register pressure problems.
2015     if (NS.size() <= 2)
2016       continue;
2017     IntervalPressure RecRegPressure;
2018     RegPressureTracker RecRPTracker(RecRegPressure);
2019     RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2020     computeLiveOuts(MF, RecRPTracker, NS);
2021     RecRPTracker.closeBottom();
2022 
2023     std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2024     llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2025       return A->NodeNum > B->NodeNum;
2026     });
2027 
2028     for (auto &SU : SUnits) {
2029       // Since we're computing the register pressure for a subset of the
2030       // instructions in a block, we need to set the tracker for each
2031       // instruction in the node-set. The tracker is set to the instruction
2032       // just after the one we're interested in.
2033       MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
2034       RecRPTracker.setPos(std::next(CurInstI));
2035 
2036       RegPressureDelta RPDelta;
2037       ArrayRef<PressureChange> CriticalPSets;
2038       RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2039                                              CriticalPSets,
2040                                              RecRegPressure.MaxSetPressure);
2041       if (RPDelta.Excess.isValid()) {
2042         LLVM_DEBUG(
2043             dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2044                    << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2045                    << ":" << RPDelta.Excess.getUnitInc() << "\n");
2046         NS.setExceedPressure(SU);
2047         break;
2048       }
2049       RecRPTracker.recede();
2050     }
2051   }
2052 }
2053 
2054 /// A heuristic to colocate node sets that have the same set of
2055 /// successors.
2056 void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2057   unsigned Colocate = 0;
2058   for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2059     NodeSet &N1 = NodeSets[i];
2060     SmallSetVector<SUnit *, 8> S1;
2061     if (N1.empty() || !succ_L(N1, S1))
2062       continue;
2063     for (int j = i + 1; j < e; ++j) {
2064       NodeSet &N2 = NodeSets[j];
2065       if (N1.compareRecMII(N2) != 0)
2066         continue;
2067       SmallSetVector<SUnit *, 8> S2;
2068       if (N2.empty() || !succ_L(N2, S2))
2069         continue;
2070       if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2071         N1.setColocate(++Colocate);
2072         N2.setColocate(Colocate);
2073         break;
2074       }
2075     }
2076   }
2077 }
2078 
2079 /// Check if the existing node-sets are profitable. If not, then ignore the
2080 /// recurrent node-sets, and attempt to schedule all nodes together. This is
2081 /// a heuristic. If the MII is large and all the recurrent node-sets are small,
2082 /// then it's best to try to schedule all instructions together instead of
2083 /// starting with the recurrent node-sets.
2084 void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2085   // Look for loops with a large MII.
2086   if (MII < 17)
2087     return;
2088   // Check if the node-set contains only a simple add recurrence.
2089   for (auto &NS : NodeSets) {
2090     if (NS.getRecMII() > 2)
2091       return;
2092     if (NS.getMaxDepth() > MII)
2093       return;
2094   }
2095   NodeSets.clear();
2096   LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2097 }
2098 
2099 /// Add the nodes that do not belong to a recurrence set into groups
2100 /// based upon connected components.
2101 void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2102   SetVector<SUnit *> NodesAdded;
2103   SmallPtrSet<SUnit *, 8> Visited;
2104   // Add the nodes that are on a path between the previous node sets and
2105   // the current node set.
2106   for (NodeSet &I : NodeSets) {
2107     SmallSetVector<SUnit *, 8> N;
2108     // Add the nodes from the current node set to the previous node set.
2109     if (succ_L(I, N)) {
2110       SetVector<SUnit *> Path;
2111       for (SUnit *NI : N) {
2112         Visited.clear();
2113         computePath(NI, Path, NodesAdded, I, Visited);
2114       }
2115       if (!Path.empty())
2116         I.insert(Path.begin(), Path.end());
2117     }
2118     // Add the nodes from the previous node set to the current node set.
2119     N.clear();
2120     if (succ_L(NodesAdded, N)) {
2121       SetVector<SUnit *> Path;
2122       for (SUnit *NI : N) {
2123         Visited.clear();
2124         computePath(NI, Path, I, NodesAdded, Visited);
2125       }
2126       if (!Path.empty())
2127         I.insert(Path.begin(), Path.end());
2128     }
2129     NodesAdded.insert(I.begin(), I.end());
2130   }
2131 
2132   // Create a new node set with the connected nodes of any successor of a node
2133   // in a recurrent set.
2134   NodeSet NewSet;
2135   SmallSetVector<SUnit *, 8> N;
2136   if (succ_L(NodesAdded, N))
2137     for (SUnit *I : N)
2138       addConnectedNodes(I, NewSet, NodesAdded);
2139   if (!NewSet.empty())
2140     NodeSets.push_back(NewSet);
2141 
2142   // Create a new node set with the connected nodes of any predecessor of a node
2143   // in a recurrent set.
2144   NewSet.clear();
2145   if (pred_L(NodesAdded, N))
2146     for (SUnit *I : N)
2147       addConnectedNodes(I, NewSet, NodesAdded);
2148   if (!NewSet.empty())
2149     NodeSets.push_back(NewSet);
2150 
2151   // Create new nodes sets with the connected nodes any remaining node that
2152   // has no predecessor.
2153   for (SUnit &SU : SUnits) {
2154     if (NodesAdded.count(&SU) == 0) {
2155       NewSet.clear();
2156       addConnectedNodes(&SU, NewSet, NodesAdded);
2157       if (!NewSet.empty())
2158         NodeSets.push_back(NewSet);
2159     }
2160   }
2161 }
2162 
2163 /// Add the node to the set, and add all of its connected nodes to the set.
2164 void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2165                                           SetVector<SUnit *> &NodesAdded) {
2166   NewSet.insert(SU);
2167   NodesAdded.insert(SU);
2168   for (auto &SI : SU->Succs) {
2169     SUnit *Successor = SI.getSUnit();
2170     if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
2171         NodesAdded.count(Successor) == 0)
2172       addConnectedNodes(Successor, NewSet, NodesAdded);
2173   }
2174   for (auto &PI : SU->Preds) {
2175     SUnit *Predecessor = PI.getSUnit();
2176     if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2177       addConnectedNodes(Predecessor, NewSet, NodesAdded);
2178   }
2179 }
2180 
2181 /// Return true if Set1 contains elements in Set2. The elements in common
2182 /// are returned in a different container.
2183 static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2184                         SmallSetVector<SUnit *, 8> &Result) {
2185   Result.clear();
2186   for (SUnit *SU : Set1) {
2187     if (Set2.count(SU) != 0)
2188       Result.insert(SU);
2189   }
2190   return !Result.empty();
2191 }
2192 
2193 /// Merge the recurrence node sets that have the same initial node.
2194 void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2195   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2196        ++I) {
2197     NodeSet &NI = *I;
2198     for (NodeSetType::iterator J = I + 1; J != E;) {
2199       NodeSet &NJ = *J;
2200       if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2201         if (NJ.compareRecMII(NI) > 0)
2202           NI.setRecMII(NJ.getRecMII());
2203         for (SUnit *SU : *J)
2204           I->insert(SU);
2205         NodeSets.erase(J);
2206         E = NodeSets.end();
2207       } else {
2208         ++J;
2209       }
2210     }
2211   }
2212 }
2213 
2214 /// Remove nodes that have been scheduled in previous NodeSets.
2215 void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2216   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2217        ++I)
2218     for (NodeSetType::iterator J = I + 1; J != E;) {
2219       J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2220 
2221       if (J->empty()) {
2222         NodeSets.erase(J);
2223         E = NodeSets.end();
2224       } else {
2225         ++J;
2226       }
2227     }
2228 }
2229 
2230 /// Compute an ordered list of the dependence graph nodes, which
2231 /// indicates the order that the nodes will be scheduled.  This is a
2232 /// two-level algorithm. First, a partial order is created, which
2233 /// consists of a list of sets ordered from highest to lowest priority.
2234 void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2235   SmallSetVector<SUnit *, 8> R;
2236   NodeOrder.clear();
2237 
2238   for (auto &Nodes : NodeSets) {
2239     LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2240     OrderKind Order;
2241     SmallSetVector<SUnit *, 8> N;
2242     if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2243       R.insert(N.begin(), N.end());
2244       Order = BottomUp;
2245       LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
2246     } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2247       R.insert(N.begin(), N.end());
2248       Order = TopDown;
2249       LLVM_DEBUG(dbgs() << "  Top down (succs) ");
2250     } else if (isIntersect(N, Nodes, R)) {
2251       // If some of the successors are in the existing node-set, then use the
2252       // top-down ordering.
2253       Order = TopDown;
2254       LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
2255     } else if (NodeSets.size() == 1) {
2256       for (const auto &N : Nodes)
2257         if (N->Succs.size() == 0)
2258           R.insert(N);
2259       Order = BottomUp;
2260       LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
2261     } else {
2262       // Find the node with the highest ASAP.
2263       SUnit *maxASAP = nullptr;
2264       for (SUnit *SU : Nodes) {
2265         if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2266             (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2267           maxASAP = SU;
2268       }
2269       R.insert(maxASAP);
2270       Order = BottomUp;
2271       LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
2272     }
2273 
2274     while (!R.empty()) {
2275       if (Order == TopDown) {
2276         // Choose the node with the maximum height.  If more than one, choose
2277         // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2278         // choose the node with the lowest MOV.
2279         while (!R.empty()) {
2280           SUnit *maxHeight = nullptr;
2281           for (SUnit *I : R) {
2282             if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2283               maxHeight = I;
2284             else if (getHeight(I) == getHeight(maxHeight) &&
2285                      getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
2286               maxHeight = I;
2287             else if (getHeight(I) == getHeight(maxHeight) &&
2288                      getZeroLatencyHeight(I) ==
2289                          getZeroLatencyHeight(maxHeight) &&
2290                      getMOV(I) < getMOV(maxHeight))
2291               maxHeight = I;
2292           }
2293           NodeOrder.insert(maxHeight);
2294           LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2295           R.remove(maxHeight);
2296           for (const auto &I : maxHeight->Succs) {
2297             if (Nodes.count(I.getSUnit()) == 0)
2298               continue;
2299             if (NodeOrder.contains(I.getSUnit()))
2300               continue;
2301             if (ignoreDependence(I, false))
2302               continue;
2303             R.insert(I.getSUnit());
2304           }
2305           // Back-edges are predecessors with an anti-dependence.
2306           for (const auto &I : maxHeight->Preds) {
2307             if (I.getKind() != SDep::Anti)
2308               continue;
2309             if (Nodes.count(I.getSUnit()) == 0)
2310               continue;
2311             if (NodeOrder.contains(I.getSUnit()))
2312               continue;
2313             R.insert(I.getSUnit());
2314           }
2315         }
2316         Order = BottomUp;
2317         LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
2318         SmallSetVector<SUnit *, 8> N;
2319         if (pred_L(NodeOrder, N, &Nodes))
2320           R.insert(N.begin(), N.end());
2321       } else {
2322         // Choose the node with the maximum depth.  If more than one, choose
2323         // the node with the maximum ZeroLatencyDepth. If still more than one,
2324         // choose the node with the lowest MOV.
2325         while (!R.empty()) {
2326           SUnit *maxDepth = nullptr;
2327           for (SUnit *I : R) {
2328             if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2329               maxDepth = I;
2330             else if (getDepth(I) == getDepth(maxDepth) &&
2331                      getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
2332               maxDepth = I;
2333             else if (getDepth(I) == getDepth(maxDepth) &&
2334                      getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
2335                      getMOV(I) < getMOV(maxDepth))
2336               maxDepth = I;
2337           }
2338           NodeOrder.insert(maxDepth);
2339           LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2340           R.remove(maxDepth);
2341           if (Nodes.isExceedSU(maxDepth)) {
2342             Order = TopDown;
2343             R.clear();
2344             R.insert(Nodes.getNode(0));
2345             break;
2346           }
2347           for (const auto &I : maxDepth->Preds) {
2348             if (Nodes.count(I.getSUnit()) == 0)
2349               continue;
2350             if (NodeOrder.contains(I.getSUnit()))
2351               continue;
2352             R.insert(I.getSUnit());
2353           }
2354           // Back-edges are predecessors with an anti-dependence.
2355           for (const auto &I : maxDepth->Succs) {
2356             if (I.getKind() != SDep::Anti)
2357               continue;
2358             if (Nodes.count(I.getSUnit()) == 0)
2359               continue;
2360             if (NodeOrder.contains(I.getSUnit()))
2361               continue;
2362             R.insert(I.getSUnit());
2363           }
2364         }
2365         Order = TopDown;
2366         LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
2367         SmallSetVector<SUnit *, 8> N;
2368         if (succ_L(NodeOrder, N, &Nodes))
2369           R.insert(N.begin(), N.end());
2370       }
2371     }
2372     LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2373   }
2374 
2375   LLVM_DEBUG({
2376     dbgs() << "Node order: ";
2377     for (SUnit *I : NodeOrder)
2378       dbgs() << " " << I->NodeNum << " ";
2379     dbgs() << "\n";
2380   });
2381 }
2382 
2383 /// Process the nodes in the computed order and create the pipelined schedule
2384 /// of the instructions, if possible. Return true if a schedule is found.
2385 bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2386 
2387   if (NodeOrder.empty()){
2388     LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2389     return false;
2390   }
2391 
2392   bool scheduleFound = false;
2393   std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2394   if (LimitRegPressure) {
2395     HRPDetector =
2396         std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2397     HRPDetector->init(RegClassInfo);
2398   }
2399   // Keep increasing II until a valid schedule is found.
2400   for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2401     Schedule.reset();
2402     Schedule.setInitiationInterval(II);
2403     LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2404 
2405     SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2406     SetVector<SUnit *>::iterator NE = NodeOrder.end();
2407     do {
2408       SUnit *SU = *NI;
2409 
2410       // Compute the schedule time for the instruction, which is based
2411       // upon the scheduled time for any predecessors/successors.
2412       int EarlyStart = INT_MIN;
2413       int LateStart = INT_MAX;
2414       // These values are set when the size of the schedule window is limited
2415       // due to chain dependences.
2416       int SchedEnd = INT_MAX;
2417       int SchedStart = INT_MIN;
2418       Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2419                             II, this);
2420       LLVM_DEBUG({
2421         dbgs() << "\n";
2422         dbgs() << "Inst (" << SU->NodeNum << ") ";
2423         SU->getInstr()->dump();
2424         dbgs() << "\n";
2425       });
2426       LLVM_DEBUG({
2427         dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
2428                          LateStart, SchedEnd, SchedStart);
2429       });
2430 
2431       if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2432           SchedStart > LateStart)
2433         scheduleFound = false;
2434       else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2435         SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2436         scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2437       } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2438         SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2439         scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2440       } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2441         SchedEnd =
2442             std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2443         // When scheduling a Phi it is better to start at the late cycle and go
2444         // backwards. The default order may insert the Phi too far away from
2445         // its first dependence.
2446         if (SU->getInstr()->isPHI())
2447           scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2448         else
2449           scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2450       } else {
2451         int FirstCycle = Schedule.getFirstCycle();
2452         scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2453                                         FirstCycle + getASAP(SU) + II - 1, II);
2454       }
2455       // Even if we find a schedule, make sure the schedule doesn't exceed the
2456       // allowable number of stages. We keep trying if this happens.
2457       if (scheduleFound)
2458         if (SwpMaxStages > -1 &&
2459             Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2460           scheduleFound = false;
2461 
2462       LLVM_DEBUG({
2463         if (!scheduleFound)
2464           dbgs() << "\tCan't schedule\n";
2465       });
2466     } while (++NI != NE && scheduleFound);
2467 
2468     // If a schedule is found, ensure non-pipelined instructions are in stage 0
2469     if (scheduleFound)
2470       scheduleFound =
2471           Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2472 
2473     // If a schedule is found, check if it is a valid schedule too.
2474     if (scheduleFound)
2475       scheduleFound = Schedule.isValidSchedule(this);
2476 
2477     // If a schedule was found and the option is enabled, check if the schedule
2478     // might generate additional register spills/fills.
2479     if (scheduleFound && LimitRegPressure)
2480       scheduleFound =
2481           !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2482   }
2483 
2484   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2485                     << " (II=" << Schedule.getInitiationInterval()
2486                     << ")\n");
2487 
2488   if (scheduleFound) {
2489     scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2490     if (!scheduleFound)
2491       LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2492   }
2493 
2494   if (scheduleFound) {
2495     Schedule.finalizeSchedule(this);
2496     Pass.ORE->emit([&]() {
2497       return MachineOptimizationRemarkAnalysis(
2498                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2499              << "Schedule found with Initiation Interval: "
2500              << ore::NV("II", Schedule.getInitiationInterval())
2501              << ", MaxStageCount: "
2502              << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2503     });
2504   } else
2505     Schedule.reset();
2506 
2507   return scheduleFound && Schedule.getMaxStageCount() > 0;
2508 }
2509 
2510 /// Return true if we can compute the amount the instruction changes
2511 /// during each iteration. Set Delta to the amount of the change.
2512 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2513   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2514   const MachineOperand *BaseOp;
2515   int64_t Offset;
2516   bool OffsetIsScalable;
2517   if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2518     return false;
2519 
2520   // FIXME: This algorithm assumes instructions have fixed-size offsets.
2521   if (OffsetIsScalable)
2522     return false;
2523 
2524   if (!BaseOp->isReg())
2525     return false;
2526 
2527   Register BaseReg = BaseOp->getReg();
2528 
2529   MachineRegisterInfo &MRI = MF.getRegInfo();
2530   // Check if there is a Phi. If so, get the definition in the loop.
2531   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2532   if (BaseDef && BaseDef->isPHI()) {
2533     BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2534     BaseDef = MRI.getVRegDef(BaseReg);
2535   }
2536   if (!BaseDef)
2537     return false;
2538 
2539   int D = 0;
2540   if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2541     return false;
2542 
2543   Delta = D;
2544   return true;
2545 }
2546 
2547 /// Check if we can change the instruction to use an offset value from the
2548 /// previous iteration. If so, return true and set the base and offset values
2549 /// so that we can rewrite the load, if necessary.
2550 ///   v1 = Phi(v0, v3)
2551 ///   v2 = load v1, 0
2552 ///   v3 = post_store v1, 4, x
2553 /// This function enables the load to be rewritten as v2 = load v3, 4.
2554 bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2555                                               unsigned &BasePos,
2556                                               unsigned &OffsetPos,
2557                                               unsigned &NewBase,
2558                                               int64_t &Offset) {
2559   // Get the load instruction.
2560   if (TII->isPostIncrement(*MI))
2561     return false;
2562   unsigned BasePosLd, OffsetPosLd;
2563   if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2564     return false;
2565   Register BaseReg = MI->getOperand(BasePosLd).getReg();
2566 
2567   // Look for the Phi instruction.
2568   MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2569   MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2570   if (!Phi || !Phi->isPHI())
2571     return false;
2572   // Get the register defined in the loop block.
2573   unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2574   if (!PrevReg)
2575     return false;
2576 
2577   // Check for the post-increment load/store instruction.
2578   MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2579   if (!PrevDef || PrevDef == MI)
2580     return false;
2581 
2582   if (!TII->isPostIncrement(*PrevDef))
2583     return false;
2584 
2585   unsigned BasePos1 = 0, OffsetPos1 = 0;
2586   if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2587     return false;
2588 
2589   // Make sure that the instructions do not access the same memory location in
2590   // the next iteration.
2591   int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2592   int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2593   MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2594   NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2595   bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2596   MF.deleteMachineInstr(NewMI);
2597   if (!Disjoint)
2598     return false;
2599 
2600   // Set the return value once we determine that we return true.
2601   BasePos = BasePosLd;
2602   OffsetPos = OffsetPosLd;
2603   NewBase = PrevReg;
2604   Offset = StoreOffset;
2605   return true;
2606 }
2607 
2608 /// Apply changes to the instruction if needed. The changes are need
2609 /// to improve the scheduling and depend up on the final schedule.
2610 void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
2611                                          SMSchedule &Schedule) {
2612   SUnit *SU = getSUnit(MI);
2613   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
2614       InstrChanges.find(SU);
2615   if (It != InstrChanges.end()) {
2616     std::pair<unsigned, int64_t> RegAndOffset = It->second;
2617     unsigned BasePos, OffsetPos;
2618     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2619       return;
2620     Register BaseReg = MI->getOperand(BasePos).getReg();
2621     MachineInstr *LoopDef = findDefInLoop(BaseReg);
2622     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2623     int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2624     int BaseStageNum = Schedule.stageScheduled(SU);
2625     int BaseCycleNum = Schedule.cycleScheduled(SU);
2626     if (BaseStageNum < DefStageNum) {
2627       MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2628       int OffsetDiff = DefStageNum - BaseStageNum;
2629       if (DefCycleNum < BaseCycleNum) {
2630         NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2631         if (OffsetDiff > 0)
2632           --OffsetDiff;
2633       }
2634       int64_t NewOffset =
2635           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2636       NewMI->getOperand(OffsetPos).setImm(NewOffset);
2637       SU->setInstr(NewMI);
2638       MISUnitMap[NewMI] = SU;
2639       NewMIs[MI] = NewMI;
2640     }
2641   }
2642 }
2643 
2644 /// Return the instruction in the loop that defines the register.
2645 /// If the definition is a Phi, then follow the Phi operand to
2646 /// the instruction in the loop.
2647 MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2648   SmallPtrSet<MachineInstr *, 8> Visited;
2649   MachineInstr *Def = MRI.getVRegDef(Reg);
2650   while (Def->isPHI()) {
2651     if (!Visited.insert(Def).second)
2652       break;
2653     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2654       if (Def->getOperand(i + 1).getMBB() == BB) {
2655         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2656         break;
2657       }
2658   }
2659   return Def;
2660 }
2661 
2662 /// Return true for an order or output dependence that is loop carried
2663 /// potentially. A dependence is loop carried if the destination defines a value
2664 /// that may be used or defined by the source in a subsequent iteration.
2665 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
2666                                          bool isSucc) {
2667   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2668       Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
2669     return false;
2670 
2671   if (!SwpPruneLoopCarried)
2672     return true;
2673 
2674   if (Dep.getKind() == SDep::Output)
2675     return true;
2676 
2677   MachineInstr *SI = Source->getInstr();
2678   MachineInstr *DI = Dep.getSUnit()->getInstr();
2679   if (!isSucc)
2680     std::swap(SI, DI);
2681   assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2682 
2683   // Assume ordered loads and stores may have a loop carried dependence.
2684   if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2685       SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2686       SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2687     return true;
2688 
2689   if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2690     return false;
2691 
2692   // The conservative assumption is that a dependence between memory operations
2693   // may be loop carried. The following code checks when it can be proved that
2694   // there is no loop carried dependence.
2695   unsigned DeltaS, DeltaD;
2696   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2697     return true;
2698 
2699   const MachineOperand *BaseOpS, *BaseOpD;
2700   int64_t OffsetS, OffsetD;
2701   bool OffsetSIsScalable, OffsetDIsScalable;
2702   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2703   if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2704                                     TRI) ||
2705       !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2706                                     TRI))
2707     return true;
2708 
2709   assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2710          "Expected offsets to be byte offsets");
2711 
2712   MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2713   MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2714   if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2715     return true;
2716 
2717   unsigned InitValS = 0;
2718   unsigned LoopValS = 0;
2719   unsigned InitValD = 0;
2720   unsigned LoopValD = 0;
2721   getPhiRegs(*DefS, BB, InitValS, LoopValS);
2722   getPhiRegs(*DefD, BB, InitValD, LoopValD);
2723   MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2724   MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2725 
2726   if (!InitDefS->isIdenticalTo(*InitDefD))
2727     return true;
2728 
2729   // Check that the base register is incremented by a constant value for each
2730   // iteration.
2731   MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2732   int D = 0;
2733   if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2734     return true;
2735 
2736   uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
2737   uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
2738 
2739   // This is the main test, which checks the offset values and the loop
2740   // increment value to determine if the accesses may be loop carried.
2741   if (AccessSizeS == MemoryLocation::UnknownSize ||
2742       AccessSizeD == MemoryLocation::UnknownSize)
2743     return true;
2744 
2745   if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
2746     return true;
2747 
2748   return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
2749 }
2750 
2751 void SwingSchedulerDAG::postProcessDAG() {
2752   for (auto &M : Mutations)
2753     M->apply(this);
2754 }
2755 
2756 /// Try to schedule the node at the specified StartCycle and continue
2757 /// until the node is schedule or the EndCycle is reached.  This function
2758 /// returns true if the node is scheduled.  This routine may search either
2759 /// forward or backward for a place to insert the instruction based upon
2760 /// the relative values of StartCycle and EndCycle.
2761 bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2762   bool forward = true;
2763   LLVM_DEBUG({
2764     dbgs() << "Trying to insert node between " << StartCycle << " and "
2765            << EndCycle << " II: " << II << "\n";
2766   });
2767   if (StartCycle > EndCycle)
2768     forward = false;
2769 
2770   // The terminating condition depends on the direction.
2771   int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2772   for (int curCycle = StartCycle; curCycle != termCycle;
2773        forward ? ++curCycle : --curCycle) {
2774 
2775     if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2776         ProcItinResources.canReserveResources(*SU, curCycle)) {
2777       LLVM_DEBUG({
2778         dbgs() << "\tinsert at cycle " << curCycle << " ";
2779         SU->getInstr()->dump();
2780       });
2781 
2782       if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2783         ProcItinResources.reserveResources(*SU, curCycle);
2784       ScheduledInstrs[curCycle].push_back(SU);
2785       InstrToCycle.insert(std::make_pair(SU, curCycle));
2786       if (curCycle > LastCycle)
2787         LastCycle = curCycle;
2788       if (curCycle < FirstCycle)
2789         FirstCycle = curCycle;
2790       return true;
2791     }
2792     LLVM_DEBUG({
2793       dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2794       SU->getInstr()->dump();
2795     });
2796   }
2797   return false;
2798 }
2799 
2800 // Return the cycle of the earliest scheduled instruction in the chain.
2801 int SMSchedule::earliestCycleInChain(const SDep &Dep) {
2802   SmallPtrSet<SUnit *, 8> Visited;
2803   SmallVector<SDep, 8> Worklist;
2804   Worklist.push_back(Dep);
2805   int EarlyCycle = INT_MAX;
2806   while (!Worklist.empty()) {
2807     const SDep &Cur = Worklist.pop_back_val();
2808     SUnit *PrevSU = Cur.getSUnit();
2809     if (Visited.count(PrevSU))
2810       continue;
2811     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2812     if (it == InstrToCycle.end())
2813       continue;
2814     EarlyCycle = std::min(EarlyCycle, it->second);
2815     for (const auto &PI : PrevSU->Preds)
2816       if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2817         Worklist.push_back(PI);
2818     Visited.insert(PrevSU);
2819   }
2820   return EarlyCycle;
2821 }
2822 
2823 // Return the cycle of the latest scheduled instruction in the chain.
2824 int SMSchedule::latestCycleInChain(const SDep &Dep) {
2825   SmallPtrSet<SUnit *, 8> Visited;
2826   SmallVector<SDep, 8> Worklist;
2827   Worklist.push_back(Dep);
2828   int LateCycle = INT_MIN;
2829   while (!Worklist.empty()) {
2830     const SDep &Cur = Worklist.pop_back_val();
2831     SUnit *SuccSU = Cur.getSUnit();
2832     if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2833       continue;
2834     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2835     if (it == InstrToCycle.end())
2836       continue;
2837     LateCycle = std::max(LateCycle, it->second);
2838     for (const auto &SI : SuccSU->Succs)
2839       if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2840         Worklist.push_back(SI);
2841     Visited.insert(SuccSU);
2842   }
2843   return LateCycle;
2844 }
2845 
2846 /// If an instruction has a use that spans multiple iterations, then
2847 /// return true. These instructions are characterized by having a back-ege
2848 /// to a Phi, which contains a reference to another Phi.
2849 static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
2850   for (auto &P : SU->Preds)
2851     if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2852       for (auto &S : P.getSUnit()->Succs)
2853         if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2854           return P.getSUnit();
2855   return nullptr;
2856 }
2857 
2858 /// Compute the scheduling start slot for the instruction.  The start slot
2859 /// depends on any predecessor or successor nodes scheduled already.
2860 void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2861                               int *MinEnd, int *MaxStart, int II,
2862                               SwingSchedulerDAG *DAG) {
2863   // Iterate over each instruction that has been scheduled already.  The start
2864   // slot computation depends on whether the previously scheduled instruction
2865   // is a predecessor or successor of the specified instruction.
2866   for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2867 
2868     // Iterate over each instruction in the current cycle.
2869     for (SUnit *I : getInstructions(cycle)) {
2870       // Because we're processing a DAG for the dependences, we recognize
2871       // the back-edge in recurrences by anti dependences.
2872       for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2873         const SDep &Dep = SU->Preds[i];
2874         if (Dep.getSUnit() == I) {
2875           if (!DAG->isBackedge(SU, Dep)) {
2876             int EarlyStart = cycle + Dep.getLatency() -
2877                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2878             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2879             if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2880               int End = earliestCycleInChain(Dep) + (II - 1);
2881               *MinEnd = std::min(*MinEnd, End);
2882             }
2883           } else {
2884             int LateStart = cycle - Dep.getLatency() +
2885                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2886             *MinLateStart = std::min(*MinLateStart, LateStart);
2887           }
2888         }
2889         // For instruction that requires multiple iterations, make sure that
2890         // the dependent instruction is not scheduled past the definition.
2891         SUnit *BE = multipleIterations(I, DAG);
2892         if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2893             !SU->isPred(I))
2894           *MinLateStart = std::min(*MinLateStart, cycle);
2895       }
2896       for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2897         if (SU->Succs[i].getSUnit() == I) {
2898           const SDep &Dep = SU->Succs[i];
2899           if (!DAG->isBackedge(SU, Dep)) {
2900             int LateStart = cycle - Dep.getLatency() +
2901                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2902             *MinLateStart = std::min(*MinLateStart, LateStart);
2903             if (DAG->isLoopCarriedDep(SU, Dep)) {
2904               int Start = latestCycleInChain(Dep) + 1 - II;
2905               *MaxStart = std::max(*MaxStart, Start);
2906             }
2907           } else {
2908             int EarlyStart = cycle + Dep.getLatency() -
2909                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2910             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2911           }
2912         }
2913       }
2914     }
2915   }
2916 }
2917 
2918 /// Order the instructions within a cycle so that the definitions occur
2919 /// before the uses. Returns true if the instruction is added to the start
2920 /// of the list, or false if added to the end.
2921 void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
2922                                  std::deque<SUnit *> &Insts) const {
2923   MachineInstr *MI = SU->getInstr();
2924   bool OrderBeforeUse = false;
2925   bool OrderAfterDef = false;
2926   bool OrderBeforeDef = false;
2927   unsigned MoveDef = 0;
2928   unsigned MoveUse = 0;
2929   int StageInst1 = stageScheduled(SU);
2930 
2931   unsigned Pos = 0;
2932   for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2933        ++I, ++Pos) {
2934     for (MachineOperand &MO : MI->operands()) {
2935       if (!MO.isReg() || !MO.getReg().isVirtual())
2936         continue;
2937 
2938       Register Reg = MO.getReg();
2939       unsigned BasePos, OffsetPos;
2940       if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2941         if (MI->getOperand(BasePos).getReg() == Reg)
2942           if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2943             Reg = NewReg;
2944       bool Reads, Writes;
2945       std::tie(Reads, Writes) =
2946           (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2947       if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2948         OrderBeforeUse = true;
2949         if (MoveUse == 0)
2950           MoveUse = Pos;
2951       } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2952         // Add the instruction after the scheduled instruction.
2953         OrderAfterDef = true;
2954         MoveDef = Pos;
2955       } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2956         if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2957           OrderBeforeUse = true;
2958           if (MoveUse == 0)
2959             MoveUse = Pos;
2960         } else {
2961           OrderAfterDef = true;
2962           MoveDef = Pos;
2963         }
2964       } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2965         OrderBeforeUse = true;
2966         if (MoveUse == 0)
2967           MoveUse = Pos;
2968         if (MoveUse != 0) {
2969           OrderAfterDef = true;
2970           MoveDef = Pos - 1;
2971         }
2972       } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2973         // Add the instruction before the scheduled instruction.
2974         OrderBeforeUse = true;
2975         if (MoveUse == 0)
2976           MoveUse = Pos;
2977       } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2978                  isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2979         if (MoveUse == 0) {
2980           OrderBeforeDef = true;
2981           MoveUse = Pos;
2982         }
2983       }
2984     }
2985     // Check for order dependences between instructions. Make sure the source
2986     // is ordered before the destination.
2987     for (auto &S : SU->Succs) {
2988       if (S.getSUnit() != *I)
2989         continue;
2990       if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2991         OrderBeforeUse = true;
2992         if (Pos < MoveUse)
2993           MoveUse = Pos;
2994       }
2995       // We did not handle HW dependences in previous for loop,
2996       // and we normally set Latency = 0 for Anti deps,
2997       // so may have nodes in same cycle with Anti denpendent on HW regs.
2998       else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
2999         OrderBeforeUse = true;
3000         if ((MoveUse == 0) || (Pos < MoveUse))
3001           MoveUse = Pos;
3002       }
3003     }
3004     for (auto &P : SU->Preds) {
3005       if (P.getSUnit() != *I)
3006         continue;
3007       if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3008         OrderAfterDef = true;
3009         MoveDef = Pos;
3010       }
3011     }
3012   }
3013 
3014   // A circular dependence.
3015   if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3016     OrderBeforeUse = false;
3017 
3018   // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3019   // to a loop-carried dependence.
3020   if (OrderBeforeDef)
3021     OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3022 
3023   // The uncommon case when the instruction order needs to be updated because
3024   // there is both a use and def.
3025   if (OrderBeforeUse && OrderAfterDef) {
3026     SUnit *UseSU = Insts.at(MoveUse);
3027     SUnit *DefSU = Insts.at(MoveDef);
3028     if (MoveUse > MoveDef) {
3029       Insts.erase(Insts.begin() + MoveUse);
3030       Insts.erase(Insts.begin() + MoveDef);
3031     } else {
3032       Insts.erase(Insts.begin() + MoveDef);
3033       Insts.erase(Insts.begin() + MoveUse);
3034     }
3035     orderDependence(SSD, UseSU, Insts);
3036     orderDependence(SSD, SU, Insts);
3037     orderDependence(SSD, DefSU, Insts);
3038     return;
3039   }
3040   // Put the new instruction first if there is a use in the list. Otherwise,
3041   // put it at the end of the list.
3042   if (OrderBeforeUse)
3043     Insts.push_front(SU);
3044   else
3045     Insts.push_back(SU);
3046 }
3047 
3048 /// Return true if the scheduled Phi has a loop carried operand.
3049 bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
3050                                MachineInstr &Phi) const {
3051   if (!Phi.isPHI())
3052     return false;
3053   assert(Phi.isPHI() && "Expecting a Phi.");
3054   SUnit *DefSU = SSD->getSUnit(&Phi);
3055   unsigned DefCycle = cycleScheduled(DefSU);
3056   int DefStage = stageScheduled(DefSU);
3057 
3058   unsigned InitVal = 0;
3059   unsigned LoopVal = 0;
3060   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3061   SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3062   if (!UseSU)
3063     return true;
3064   if (UseSU->getInstr()->isPHI())
3065     return true;
3066   unsigned LoopCycle = cycleScheduled(UseSU);
3067   int LoopStage = stageScheduled(UseSU);
3068   return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3069 }
3070 
3071 /// Return true if the instruction is a definition that is loop carried
3072 /// and defines the use on the next iteration.
3073 ///        v1 = phi(v2, v3)
3074 ///  (Def) v3 = op v1
3075 ///  (MO)   = v1
3076 /// If MO appears before Def, then v1 and v3 may get assigned to the same
3077 /// register.
3078 bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
3079                                        MachineInstr *Def,
3080                                        MachineOperand &MO) const {
3081   if (!MO.isReg())
3082     return false;
3083   if (Def->isPHI())
3084     return false;
3085   MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3086   if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3087     return false;
3088   if (!isLoopCarried(SSD, *Phi))
3089     return false;
3090   unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3091   for (MachineOperand &DMO : Def->all_defs()) {
3092     if (DMO.getReg() == LoopReg)
3093       return true;
3094   }
3095   return false;
3096 }
3097 
3098 /// Determine transitive dependences of unpipelineable instructions
3099 SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
3100     SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
3101   SmallSet<SUnit *, 8> DoNotPipeline;
3102   SmallVector<SUnit *, 8> Worklist;
3103 
3104   for (auto &SU : SSD->SUnits)
3105     if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3106       Worklist.push_back(&SU);
3107 
3108   while (!Worklist.empty()) {
3109     auto SU = Worklist.pop_back_val();
3110     if (DoNotPipeline.count(SU))
3111       continue;
3112     LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3113     DoNotPipeline.insert(SU);
3114     for (auto &Dep : SU->Preds)
3115       Worklist.push_back(Dep.getSUnit());
3116     if (SU->getInstr()->isPHI())
3117       for (auto &Dep : SU->Succs)
3118         if (Dep.getKind() == SDep::Anti)
3119           Worklist.push_back(Dep.getSUnit());
3120   }
3121   return DoNotPipeline;
3122 }
3123 
3124 // Determine all instructions upon which any unpipelineable instruction depends
3125 // and ensure that they are in stage 0.  If unable to do so, return false.
3126 bool SMSchedule::normalizeNonPipelinedInstructions(
3127     SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
3128   SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
3129 
3130   int NewLastCycle = INT_MIN;
3131   for (SUnit &SU : SSD->SUnits) {
3132     if (!SU.isInstr())
3133       continue;
3134     if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3135       NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3136       continue;
3137     }
3138 
3139     // Put the non-pipelined instruction as early as possible in the schedule
3140     int NewCycle = getFirstCycle();
3141     for (auto &Dep : SU.Preds)
3142       NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
3143 
3144     int OldCycle = InstrToCycle[&SU];
3145     if (OldCycle != NewCycle) {
3146       InstrToCycle[&SU] = NewCycle;
3147       auto &OldS = getInstructions(OldCycle);
3148       llvm::erase(OldS, &SU);
3149       getInstructions(NewCycle).emplace_back(&SU);
3150       LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3151                         << ") is not pipelined; moving from cycle " << OldCycle
3152                         << " to " << NewCycle << " Instr:" << *SU.getInstr());
3153     }
3154     NewLastCycle = std::max(NewLastCycle, NewCycle);
3155   }
3156   LastCycle = NewLastCycle;
3157   return true;
3158 }
3159 
3160 // Check if the generated schedule is valid. This function checks if
3161 // an instruction that uses a physical register is scheduled in a
3162 // different stage than the definition. The pipeliner does not handle
3163 // physical register values that may cross a basic block boundary.
3164 // Furthermore, if a physical def/use pair is assigned to the same
3165 // cycle, orderDependence does not guarantee def/use ordering, so that
3166 // case should be considered invalid.  (The test checks for both
3167 // earlier and same-cycle use to be more robust.)
3168 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3169   for (SUnit &SU : SSD->SUnits) {
3170     if (!SU.hasPhysRegDefs)
3171       continue;
3172     int StageDef = stageScheduled(&SU);
3173     int CycleDef = InstrToCycle[&SU];
3174     assert(StageDef != -1 && "Instruction should have been scheduled.");
3175     for (auto &SI : SU.Succs)
3176       if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
3177         if (Register::isPhysicalRegister(SI.getReg())) {
3178           if (stageScheduled(SI.getSUnit()) != StageDef)
3179             return false;
3180           if (InstrToCycle[SI.getSUnit()] <= CycleDef)
3181             return false;
3182         }
3183   }
3184   return true;
3185 }
3186 
3187 /// A property of the node order in swing-modulo-scheduling is
3188 /// that for nodes outside circuits the following holds:
3189 /// none of them is scheduled after both a successor and a
3190 /// predecessor.
3191 /// The method below checks whether the property is met.
3192 /// If not, debug information is printed and statistics information updated.
3193 /// Note that we do not use an assert statement.
3194 /// The reason is that although an invalid node oder may prevent
3195 /// the pipeliner from finding a pipelined schedule for arbitrary II,
3196 /// it does not lead to the generation of incorrect code.
3197 void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3198 
3199   // a sorted vector that maps each SUnit to its index in the NodeOrder
3200   typedef std::pair<SUnit *, unsigned> UnitIndex;
3201   std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3202 
3203   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3204     Indices.push_back(std::make_pair(NodeOrder[i], i));
3205 
3206   auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3207     return std::get<0>(i1) < std::get<0>(i2);
3208   };
3209 
3210   // sort, so that we can perform a binary search
3211   llvm::sort(Indices, CompareKey);
3212 
3213   bool Valid = true;
3214   (void)Valid;
3215   // for each SUnit in the NodeOrder, check whether
3216   // it appears after both a successor and a predecessor
3217   // of the SUnit. If this is the case, and the SUnit
3218   // is not part of circuit, then the NodeOrder is not
3219   // valid.
3220   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3221     SUnit *SU = NodeOrder[i];
3222     unsigned Index = i;
3223 
3224     bool PredBefore = false;
3225     bool SuccBefore = false;
3226 
3227     SUnit *Succ;
3228     SUnit *Pred;
3229     (void)Succ;
3230     (void)Pred;
3231 
3232     for (SDep &PredEdge : SU->Preds) {
3233       SUnit *PredSU = PredEdge.getSUnit();
3234       unsigned PredIndex = std::get<1>(
3235           *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3236       if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3237         PredBefore = true;
3238         Pred = PredSU;
3239         break;
3240       }
3241     }
3242 
3243     for (SDep &SuccEdge : SU->Succs) {
3244       SUnit *SuccSU = SuccEdge.getSUnit();
3245       // Do not process a boundary node, it was not included in NodeOrder,
3246       // hence not in Indices either, call to std::lower_bound() below will
3247       // return Indices.end().
3248       if (SuccSU->isBoundaryNode())
3249         continue;
3250       unsigned SuccIndex = std::get<1>(
3251           *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3252       if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3253         SuccBefore = true;
3254         Succ = SuccSU;
3255         break;
3256       }
3257     }
3258 
3259     if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3260       // instructions in circuits are allowed to be scheduled
3261       // after both a successor and predecessor.
3262       bool InCircuit = llvm::any_of(
3263           Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3264       if (InCircuit)
3265         LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3266       else {
3267         Valid = false;
3268         NumNodeOrderIssues++;
3269         LLVM_DEBUG(dbgs() << "Predecessor ";);
3270       }
3271       LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3272                         << " are scheduled before node " << SU->NodeNum
3273                         << "\n";);
3274     }
3275   }
3276 
3277   LLVM_DEBUG({
3278     if (!Valid)
3279       dbgs() << "Invalid node order found!\n";
3280   });
3281 }
3282 
3283 /// Attempt to fix the degenerate cases when the instruction serialization
3284 /// causes the register lifetimes to overlap. For example,
3285 ///   p' = store_pi(p, b)
3286 ///      = load p, offset
3287 /// In this case p and p' overlap, which means that two registers are needed.
3288 /// Instead, this function changes the load to use p' and updates the offset.
3289 void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3290   unsigned OverlapReg = 0;
3291   unsigned NewBaseReg = 0;
3292   for (SUnit *SU : Instrs) {
3293     MachineInstr *MI = SU->getInstr();
3294     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3295       const MachineOperand &MO = MI->getOperand(i);
3296       // Look for an instruction that uses p. The instruction occurs in the
3297       // same cycle but occurs later in the serialized order.
3298       if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3299         // Check that the instruction appears in the InstrChanges structure,
3300         // which contains instructions that can have the offset updated.
3301         DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3302           InstrChanges.find(SU);
3303         if (It != InstrChanges.end()) {
3304           unsigned BasePos, OffsetPos;
3305           // Update the base register and adjust the offset.
3306           if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3307             MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3308             NewMI->getOperand(BasePos).setReg(NewBaseReg);
3309             int64_t NewOffset =
3310                 MI->getOperand(OffsetPos).getImm() - It->second.second;
3311             NewMI->getOperand(OffsetPos).setImm(NewOffset);
3312             SU->setInstr(NewMI);
3313             MISUnitMap[NewMI] = SU;
3314             NewMIs[MI] = NewMI;
3315           }
3316         }
3317         OverlapReg = 0;
3318         NewBaseReg = 0;
3319         break;
3320       }
3321       // Look for an instruction of the form p' = op(p), which uses and defines
3322       // two virtual registers that get allocated to the same physical register.
3323       unsigned TiedUseIdx = 0;
3324       if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3325         // OverlapReg is p in the example above.
3326         OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3327         // NewBaseReg is p' in the example above.
3328         NewBaseReg = MI->getOperand(i).getReg();
3329         break;
3330       }
3331     }
3332   }
3333 }
3334 
3335 std::deque<SUnit *>
3336 SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
3337                                 const std::deque<SUnit *> &Instrs) const {
3338   std::deque<SUnit *> NewOrderPhi;
3339   for (SUnit *SU : Instrs) {
3340     if (SU->getInstr()->isPHI())
3341       NewOrderPhi.push_back(SU);
3342   }
3343   std::deque<SUnit *> NewOrderI;
3344   for (SUnit *SU : Instrs) {
3345     if (!SU->getInstr()->isPHI())
3346       orderDependence(SSD, SU, NewOrderI);
3347   }
3348   llvm::append_range(NewOrderPhi, NewOrderI);
3349   return NewOrderPhi;
3350 }
3351 
3352 /// After the schedule has been formed, call this function to combine
3353 /// the instructions from the different stages/cycles.  That is, this
3354 /// function creates a schedule that represents a single iteration.
3355 void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3356   // Move all instructions to the first stage from later stages.
3357   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3358     for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3359          ++stage) {
3360       std::deque<SUnit *> &cycleInstrs =
3361           ScheduledInstrs[cycle + (stage * InitiationInterval)];
3362       for (SUnit *SU : llvm::reverse(cycleInstrs))
3363         ScheduledInstrs[cycle].push_front(SU);
3364     }
3365   }
3366 
3367   // Erase all the elements in the later stages. Only one iteration should
3368   // remain in the scheduled list, and it contains all the instructions.
3369   for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3370     ScheduledInstrs.erase(cycle);
3371 
3372   // Change the registers in instruction as specified in the InstrChanges
3373   // map. We need to use the new registers to create the correct order.
3374   for (const SUnit &SU : SSD->SUnits)
3375     SSD->applyInstrChange(SU.getInstr(), *this);
3376 
3377   // Reorder the instructions in each cycle to fix and improve the
3378   // generated code.
3379   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3380     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3381     cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3382     SSD->fixupRegisterOverlaps(cycleInstrs);
3383   }
3384 
3385   LLVM_DEBUG(dump(););
3386 }
3387 
3388 void NodeSet::print(raw_ostream &os) const {
3389   os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3390      << " depth " << MaxDepth << " col " << Colocate << "\n";
3391   for (const auto &I : Nodes)
3392     os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
3393   os << "\n";
3394 }
3395 
3396 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3397 /// Print the schedule information to the given output.
3398 void SMSchedule::print(raw_ostream &os) const {
3399   // Iterate over each cycle.
3400   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3401     // Iterate over each instruction in the cycle.
3402     const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3403     for (SUnit *CI : cycleInstrs->second) {
3404       os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3405       os << "(" << CI->NodeNum << ") ";
3406       CI->getInstr()->print(os);
3407       os << "\n";
3408     }
3409   }
3410 }
3411 
3412 /// Utility function used for debugging to print the schedule.
3413 LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
3414 LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
3415 
3416 void ResourceManager::dumpMRT() const {
3417   LLVM_DEBUG({
3418     if (UseDFA)
3419       return;
3420     std::stringstream SS;
3421     SS << "MRT:\n";
3422     SS << std::setw(4) << "Slot";
3423     for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3424       SS << std::setw(3) << I;
3425     SS << std::setw(7) << "#Mops"
3426        << "\n";
3427     for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3428       SS << std::setw(4) << Slot;
3429       for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3430         SS << std::setw(3) << MRT[Slot][I];
3431       SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3432     }
3433     dbgs() << SS.str();
3434   });
3435 }
3436 #endif
3437 
3438 void ResourceManager::initProcResourceVectors(
3439     const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3440   unsigned ProcResourceID = 0;
3441 
3442   // We currently limit the resource kinds to 64 and below so that we can use
3443   // uint64_t for Masks
3444   assert(SM.getNumProcResourceKinds() < 64 &&
3445          "Too many kinds of resources, unsupported");
3446   // Create a unique bitmask for every processor resource unit.
3447   // Skip resource at index 0, since it always references 'InvalidUnit'.
3448   Masks.resize(SM.getNumProcResourceKinds());
3449   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3450     const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3451     if (Desc.SubUnitsIdxBegin)
3452       continue;
3453     Masks[I] = 1ULL << ProcResourceID;
3454     ProcResourceID++;
3455   }
3456   // Create a unique bitmask for every processor resource group.
3457   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3458     const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3459     if (!Desc.SubUnitsIdxBegin)
3460       continue;
3461     Masks[I] = 1ULL << ProcResourceID;
3462     for (unsigned U = 0; U < Desc.NumUnits; ++U)
3463       Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3464     ProcResourceID++;
3465   }
3466   LLVM_DEBUG({
3467     if (SwpShowResMask) {
3468       dbgs() << "ProcResourceDesc:\n";
3469       for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3470         const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3471         dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3472                          ProcResource->Name, I, Masks[I],
3473                          ProcResource->NumUnits);
3474       }
3475       dbgs() << " -----------------\n";
3476     }
3477   });
3478 }
3479 
3480 bool ResourceManager::canReserveResources(SUnit &SU, int Cycle) {
3481   LLVM_DEBUG({
3482     if (SwpDebugResource)
3483       dbgs() << "canReserveResources:\n";
3484   });
3485   if (UseDFA)
3486     return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3487         ->canReserveResources(&SU.getInstr()->getDesc());
3488 
3489   const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3490   if (!SCDesc->isValid()) {
3491     LLVM_DEBUG({
3492       dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3493       dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3494     });
3495     return true;
3496   }
3497 
3498   reserveResources(SCDesc, Cycle);
3499   bool Result = !isOverbooked();
3500   unreserveResources(SCDesc, Cycle);
3501 
3502   LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
3503   return Result;
3504 }
3505 
3506 void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3507   LLVM_DEBUG({
3508     if (SwpDebugResource)
3509       dbgs() << "reserveResources:\n";
3510   });
3511   if (UseDFA)
3512     return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3513         ->reserveResources(&SU.getInstr()->getDesc());
3514 
3515   const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3516   if (!SCDesc->isValid()) {
3517     LLVM_DEBUG({
3518       dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3519       dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3520     });
3521     return;
3522   }
3523 
3524   reserveResources(SCDesc, Cycle);
3525 
3526   LLVM_DEBUG({
3527     if (SwpDebugResource) {
3528       dumpMRT();
3529       dbgs() << "reserveResources: done!\n\n";
3530     }
3531   });
3532 }
3533 
3534 void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3535                                        int Cycle) {
3536   assert(!UseDFA);
3537   for (const MCWriteProcResEntry &PRE : make_range(
3538            STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3539     for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3540       ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3541 
3542   for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3543     ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3544 }
3545 
3546 void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3547                                          int Cycle) {
3548   assert(!UseDFA);
3549   for (const MCWriteProcResEntry &PRE : make_range(
3550            STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3551     for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3552       --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3553 
3554   for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3555     --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3556 }
3557 
3558 bool ResourceManager::isOverbooked() const {
3559   assert(!UseDFA);
3560   for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3561     for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3562       const MCProcResourceDesc *Desc = SM.getProcResource(I);
3563       if (MRT[Slot][I] > Desc->NumUnits)
3564         return true;
3565     }
3566     if (NumScheduledMops[Slot] > IssueWidth)
3567       return true;
3568   }
3569   return false;
3570 }
3571 
3572 int ResourceManager::calculateResMIIDFA() const {
3573   assert(UseDFA);
3574 
3575   // Sort the instructions by the number of available choices for scheduling,
3576   // least to most. Use the number of critical resources as the tie breaker.
3577   FuncUnitSorter FUS = FuncUnitSorter(*ST);
3578   for (SUnit &SU : DAG->SUnits)
3579     FUS.calcCriticalResources(*SU.getInstr());
3580   PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
3581       FuncUnitOrder(FUS);
3582 
3583   for (SUnit &SU : DAG->SUnits)
3584     FuncUnitOrder.push(SU.getInstr());
3585 
3586   SmallVector<std::unique_ptr<DFAPacketizer>, 8> Resources;
3587   Resources.push_back(
3588       std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3589 
3590   while (!FuncUnitOrder.empty()) {
3591     MachineInstr *MI = FuncUnitOrder.top();
3592     FuncUnitOrder.pop();
3593     if (TII->isZeroCost(MI->getOpcode()))
3594       continue;
3595 
3596     // Attempt to reserve the instruction in an existing DFA. At least one
3597     // DFA is needed for each cycle.
3598     unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3599     unsigned ReservedCycles = 0;
3600     auto *RI = Resources.begin();
3601     auto *RE = Resources.end();
3602     LLVM_DEBUG({
3603       dbgs() << "Trying to reserve resource for " << NumCycles
3604              << " cycles for \n";
3605       MI->dump();
3606     });
3607     for (unsigned C = 0; C < NumCycles; ++C)
3608       while (RI != RE) {
3609         if ((*RI)->canReserveResources(*MI)) {
3610           (*RI)->reserveResources(*MI);
3611           ++ReservedCycles;
3612           break;
3613         }
3614         RI++;
3615       }
3616     LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3617                       << ", NumCycles:" << NumCycles << "\n");
3618     // Add new DFAs, if needed, to reserve resources.
3619     for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3620       LLVM_DEBUG(if (SwpDebugResource) dbgs()
3621                  << "NewResource created to reserve resources"
3622                  << "\n");
3623       auto *NewResource = TII->CreateTargetScheduleState(*ST);
3624       assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3625       NewResource->reserveResources(*MI);
3626       Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3627     }
3628   }
3629 
3630   int Resmii = Resources.size();
3631   LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3632   return Resmii;
3633 }
3634 
3635 int ResourceManager::calculateResMII() const {
3636   if (UseDFA)
3637     return calculateResMIIDFA();
3638 
3639   // Count each resource consumption and divide it by the number of units.
3640   // ResMII is the max value among them.
3641 
3642   int NumMops = 0;
3643   SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
3644   for (SUnit &SU : DAG->SUnits) {
3645     if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3646       continue;
3647 
3648     const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3649     if (!SCDesc->isValid())
3650       continue;
3651 
3652     LLVM_DEBUG({
3653       if (SwpDebugResource) {
3654         DAG->dumpNode(SU);
3655         dbgs() << "  #Mops: " << SCDesc->NumMicroOps << "\n"
3656                << "  WriteProcRes: ";
3657       }
3658     });
3659     NumMops += SCDesc->NumMicroOps;
3660     for (const MCWriteProcResEntry &PRE :
3661          make_range(STI->getWriteProcResBegin(SCDesc),
3662                     STI->getWriteProcResEnd(SCDesc))) {
3663       LLVM_DEBUG({
3664         if (SwpDebugResource) {
3665           const MCProcResourceDesc *Desc =
3666               SM.getProcResource(PRE.ProcResourceIdx);
3667           dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3668         }
3669       });
3670       ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3671     }
3672     LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3673   }
3674 
3675   int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3676   LLVM_DEBUG({
3677     if (SwpDebugResource)
3678       dbgs() << "#Mops: " << NumMops << ", "
3679              << "IssueWidth: " << IssueWidth << ", "
3680              << "Cycles: " << Result << "\n";
3681   });
3682 
3683   LLVM_DEBUG({
3684     if (SwpDebugResource) {
3685       std::stringstream SS;
3686       SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3687          << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3688          << "\n";
3689       dbgs() << SS.str();
3690     }
3691   });
3692   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3693     const MCProcResourceDesc *Desc = SM.getProcResource(I);
3694     int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3695     LLVM_DEBUG({
3696       if (SwpDebugResource) {
3697         std::stringstream SS;
3698         SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3699            << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3700            << std::setw(10) << Cycles << "\n";
3701         dbgs() << SS.str();
3702       }
3703     });
3704     if (Cycles > Result)
3705       Result = Cycles;
3706   }
3707   return Result;
3708 }
3709 
3710 void ResourceManager::init(int II) {
3711   InitiationInterval = II;
3712   DFAResources.clear();
3713   DFAResources.resize(II);
3714   for (auto &I : DFAResources)
3715     I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3716   MRT.clear();
3717   MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
3718   NumScheduledMops.clear();
3719   NumScheduledMops.resize(II);
3720 }
3721