xref: /llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp (revision b24af43fdfa1b1242b7cb77540462212227c57c4)
1 //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10 //
11 // This SMS implementation is a target-independent back-end pass. When enabled,
12 // the pass runs just prior to the register allocation pass, while the machine
13 // IR is in SSA form. If software pipelining is successful, then the original
14 // loop is replaced by the optimized loop. The optimized loop contains one or
15 // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16 // the instructions cannot be scheduled in a given MII, we increase the MII by
17 // one and try again.
18 //
19 // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20 // represent loop carried dependences in the DAG as order edges to the Phi
21 // nodes. We also perform several passes over the DAG to eliminate unnecessary
22 // edges that inhibit the ability to pipeline. The implementation uses the
23 // DFAPacketizer class to compute the minimum initiation interval and the check
24 // where an instruction may be inserted in the pipelined schedule.
25 //
26 // In order for the SMS pass to work, several target specific hooks need to be
27 // implemented to get information about the loop structure and to rewrite
28 // instructions.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #include "llvm/CodeGen/MachinePipeliner.h"
33 #include "llvm/ADT/ArrayRef.h"
34 #include "llvm/ADT/BitVector.h"
35 #include "llvm/ADT/DenseMap.h"
36 #include "llvm/ADT/MapVector.h"
37 #include "llvm/ADT/PriorityQueue.h"
38 #include "llvm/ADT/STLExtras.h"
39 #include "llvm/ADT/SetOperations.h"
40 #include "llvm/ADT/SetVector.h"
41 #include "llvm/ADT/SmallPtrSet.h"
42 #include "llvm/ADT/SmallSet.h"
43 #include "llvm/ADT/SmallVector.h"
44 #include "llvm/ADT/Statistic.h"
45 #include "llvm/ADT/iterator_range.h"
46 #include "llvm/Analysis/AliasAnalysis.h"
47 #include "llvm/Analysis/CycleAnalysis.h"
48 #include "llvm/Analysis/MemoryLocation.h"
49 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
50 #include "llvm/Analysis/ValueTracking.h"
51 #include "llvm/CodeGen/DFAPacketizer.h"
52 #include "llvm/CodeGen/LiveIntervals.h"
53 #include "llvm/CodeGen/MachineBasicBlock.h"
54 #include "llvm/CodeGen/MachineDominators.h"
55 #include "llvm/CodeGen/MachineFunction.h"
56 #include "llvm/CodeGen/MachineFunctionPass.h"
57 #include "llvm/CodeGen/MachineInstr.h"
58 #include "llvm/CodeGen/MachineInstrBuilder.h"
59 #include "llvm/CodeGen/MachineLoopInfo.h"
60 #include "llvm/CodeGen/MachineMemOperand.h"
61 #include "llvm/CodeGen/MachineOperand.h"
62 #include "llvm/CodeGen/MachineRegisterInfo.h"
63 #include "llvm/CodeGen/ModuloSchedule.h"
64 #include "llvm/CodeGen/Register.h"
65 #include "llvm/CodeGen/RegisterClassInfo.h"
66 #include "llvm/CodeGen/RegisterPressure.h"
67 #include "llvm/CodeGen/ScheduleDAG.h"
68 #include "llvm/CodeGen/ScheduleDAGMutation.h"
69 #include "llvm/CodeGen/TargetInstrInfo.h"
70 #include "llvm/CodeGen/TargetOpcodes.h"
71 #include "llvm/CodeGen/TargetRegisterInfo.h"
72 #include "llvm/CodeGen/TargetSubtargetInfo.h"
73 #include "llvm/Config/llvm-config.h"
74 #include "llvm/IR/Attributes.h"
75 #include "llvm/IR/Function.h"
76 #include "llvm/MC/LaneBitmask.h"
77 #include "llvm/MC/MCInstrDesc.h"
78 #include "llvm/MC/MCInstrItineraries.h"
79 #include "llvm/MC/MCRegisterInfo.h"
80 #include "llvm/Pass.h"
81 #include "llvm/Support/CommandLine.h"
82 #include "llvm/Support/Compiler.h"
83 #include "llvm/Support/Debug.h"
84 #include "llvm/Support/MathExtras.h"
85 #include "llvm/Support/raw_ostream.h"
86 #include <algorithm>
87 #include <cassert>
88 #include <climits>
89 #include <cstdint>
90 #include <deque>
91 #include <functional>
92 #include <iomanip>
93 #include <iterator>
94 #include <map>
95 #include <memory>
96 #include <sstream>
97 #include <tuple>
98 #include <utility>
99 #include <vector>
100 
101 using namespace llvm;
102 
103 #define DEBUG_TYPE "pipeliner"
104 
105 STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
106 STATISTIC(NumPipelined, "Number of loops software pipelined");
107 STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
108 STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
109 STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
110 STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
111 STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
112 STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
113 STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
114 STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
115 STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
116 
117 /// A command line option to turn software pipelining on or off.
118 static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
119                                cl::desc("Enable Software Pipelining"));
120 
121 /// A command line option to enable SWP at -Os.
122 static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
123                                       cl::desc("Enable SWP at Os."), cl::Hidden,
124                                       cl::init(false));
125 
126 /// A command line argument to limit minimum initial interval for pipelining.
127 static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
128                               cl::desc("Size limit for the MII."),
129                               cl::Hidden, cl::init(27));
130 
131 /// A command line argument to force pipeliner to use specified initial
132 /// interval.
133 static cl::opt<int> SwpForceII("pipeliner-force-ii",
134                                cl::desc("Force pipeliner to use specified II."),
135                                cl::Hidden, cl::init(-1));
136 
137 /// A command line argument to limit the number of stages in the pipeline.
138 static cl::opt<int>
139     SwpMaxStages("pipeliner-max-stages",
140                  cl::desc("Maximum stages allowed in the generated scheduled."),
141                  cl::Hidden, cl::init(3));
142 
143 /// A command line option to disable the pruning of chain dependences due to
144 /// an unrelated Phi.
145 static cl::opt<bool>
146     SwpPruneDeps("pipeliner-prune-deps",
147                  cl::desc("Prune dependences between unrelated Phi nodes."),
148                  cl::Hidden, cl::init(true));
149 
150 /// A command line option to disable the pruning of loop carried order
151 /// dependences.
152 static cl::opt<bool>
153     SwpPruneLoopCarried("pipeliner-prune-loop-carried",
154                         cl::desc("Prune loop carried order dependences."),
155                         cl::Hidden, cl::init(true));
156 
157 #ifndef NDEBUG
158 static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
159 #endif
160 
161 static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
162                                      cl::ReallyHidden,
163                                      cl::desc("Ignore RecMII"));
164 
165 static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
166                                     cl::init(false));
167 static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
168                                       cl::init(false));
169 
170 static cl::opt<bool> EmitTestAnnotations(
171     "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
172     cl::desc("Instead of emitting the pipelined code, annotate instructions "
173              "with the generated schedule for feeding into the "
174              "-modulo-schedule-test pass"));
175 
176 static cl::opt<bool> ExperimentalCodeGen(
177     "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
178     cl::desc(
179         "Use the experimental peeling code generator for software pipelining"));
180 
181 static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
182                                      cl::desc("Range to search for II"),
183                                      cl::Hidden, cl::init(10));
184 
185 static cl::opt<bool>
186     LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
187                      cl::desc("Limit register pressure of scheduled loop"));
188 
189 static cl::opt<int>
190     RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
191                       cl::init(5),
192                       cl::desc("Margin representing the unused percentage of "
193                                "the register pressure limit"));
194 
195 namespace llvm {
196 
197 // A command line option to enable the CopyToPhi DAG mutation.
198 cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
199                                  cl::init(true),
200                                  cl::desc("Enable CopyToPhi DAG Mutation"));
201 
202 /// A command line argument to force pipeliner to use specified issue
203 /// width.
204 cl::opt<int> SwpForceIssueWidth(
205     "pipeliner-force-issue-width",
206     cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
207     cl::init(-1));
208 
209 } // end namespace llvm
210 
211 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
212 char MachinePipeliner::ID = 0;
213 #ifndef NDEBUG
214 int MachinePipeliner::NumTries = 0;
215 #endif
216 char &llvm::MachinePipelinerID = MachinePipeliner::ID;
217 
218 INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
219                       "Modulo Software Pipelining", false, false)
220 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
221 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
222 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
223 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
224 INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
225                     "Modulo Software Pipelining", false, false)
226 
227 /// The "main" function for implementing Swing Modulo Scheduling.
228 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
229   if (skipFunction(mf.getFunction()))
230     return false;
231 
232   if (!EnableSWP)
233     return false;
234 
235   if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
236       !EnableSWPOptSize.getPosition())
237     return false;
238 
239   if (!mf.getSubtarget().enableMachinePipeliner())
240     return false;
241 
242   // Cannot pipeline loops without instruction itineraries if we are using
243   // DFA for the pipeliner.
244   if (mf.getSubtarget().useDFAforSMS() &&
245       (!mf.getSubtarget().getInstrItineraryData() ||
246        mf.getSubtarget().getInstrItineraryData()->isEmpty()))
247     return false;
248 
249   MF = &mf;
250   MLI = &getAnalysis<MachineLoopInfo>();
251   MDT = &getAnalysis<MachineDominatorTree>();
252   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
253   TII = MF->getSubtarget().getInstrInfo();
254   RegClassInfo.runOnMachineFunction(*MF);
255 
256   for (const auto &L : *MLI)
257     scheduleLoop(*L);
258 
259   return false;
260 }
261 
262 /// Attempt to perform the SMS algorithm on the specified loop. This function is
263 /// the main entry point for the algorithm.  The function identifies candidate
264 /// loops, calculates the minimum initiation interval, and attempts to schedule
265 /// the loop.
266 bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
267   bool Changed = false;
268   for (const auto &InnerLoop : L)
269     Changed |= scheduleLoop(*InnerLoop);
270 
271 #ifndef NDEBUG
272   // Stop trying after reaching the limit (if any).
273   int Limit = SwpLoopLimit;
274   if (Limit >= 0) {
275     if (NumTries >= SwpLoopLimit)
276       return Changed;
277     NumTries++;
278   }
279 #endif
280 
281   setPragmaPipelineOptions(L);
282   if (!canPipelineLoop(L)) {
283     LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
284     ORE->emit([&]() {
285       return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
286                                              L.getStartLoc(), L.getHeader())
287              << "Failed to pipeline loop";
288     });
289 
290     LI.LoopPipelinerInfo.reset();
291     return Changed;
292   }
293 
294   ++NumTrytoPipeline;
295 
296   Changed = swingModuloScheduler(L);
297 
298   LI.LoopPipelinerInfo.reset();
299   return Changed;
300 }
301 
302 void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
303   // Reset the pragma for the next loop in iteration.
304   disabledByPragma = false;
305   II_setByPragma = 0;
306 
307   MachineBasicBlock *LBLK = L.getTopBlock();
308 
309   if (LBLK == nullptr)
310     return;
311 
312   const BasicBlock *BBLK = LBLK->getBasicBlock();
313   if (BBLK == nullptr)
314     return;
315 
316   const Instruction *TI = BBLK->getTerminator();
317   if (TI == nullptr)
318     return;
319 
320   MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
321   if (LoopID == nullptr)
322     return;
323 
324   assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
325   assert(LoopID->getOperand(0) == LoopID && "invalid loop");
326 
327   for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
328     MDNode *MD = dyn_cast<MDNode>(MDO);
329 
330     if (MD == nullptr)
331       continue;
332 
333     MDString *S = dyn_cast<MDString>(MD->getOperand(0));
334 
335     if (S == nullptr)
336       continue;
337 
338     if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
339       assert(MD->getNumOperands() == 2 &&
340              "Pipeline initiation interval hint metadata should have two operands.");
341       II_setByPragma =
342           mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
343       assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
344     } else if (S->getString() == "llvm.loop.pipeline.disable") {
345       disabledByPragma = true;
346     }
347   }
348 }
349 
350 /// Return true if the loop can be software pipelined.  The algorithm is
351 /// restricted to loops with a single basic block.  Make sure that the
352 /// branch in the loop can be analyzed.
353 bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
354   if (L.getNumBlocks() != 1) {
355     ORE->emit([&]() {
356       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
357                                                L.getStartLoc(), L.getHeader())
358              << "Not a single basic block: "
359              << ore::NV("NumBlocks", L.getNumBlocks());
360     });
361     return false;
362   }
363 
364   if (disabledByPragma) {
365     ORE->emit([&]() {
366       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
367                                                L.getStartLoc(), L.getHeader())
368              << "Disabled by Pragma.";
369     });
370     return false;
371   }
372 
373   // Check if the branch can't be understood because we can't do pipelining
374   // if that's the case.
375   LI.TBB = nullptr;
376   LI.FBB = nullptr;
377   LI.BrCond.clear();
378   if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
379     LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
380     NumFailBranch++;
381     ORE->emit([&]() {
382       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
383                                                L.getStartLoc(), L.getHeader())
384              << "The branch can't be understood";
385     });
386     return false;
387   }
388 
389   LI.LoopInductionVar = nullptr;
390   LI.LoopCompare = nullptr;
391   LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
392   if (!LI.LoopPipelinerInfo) {
393     LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
394     NumFailLoop++;
395     ORE->emit([&]() {
396       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
397                                                L.getStartLoc(), L.getHeader())
398              << "The loop structure is not supported";
399     });
400     return false;
401   }
402 
403   if (!L.getLoopPreheader()) {
404     LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
405     NumFailPreheader++;
406     ORE->emit([&]() {
407       return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
408                                                L.getStartLoc(), L.getHeader())
409              << "No loop preheader found";
410     });
411     return false;
412   }
413 
414   // Remove any subregisters from inputs to phi nodes.
415   preprocessPhiNodes(*L.getHeader());
416   return true;
417 }
418 
419 void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
420   MachineRegisterInfo &MRI = MF->getRegInfo();
421   SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
422 
423   for (MachineInstr &PI : B.phis()) {
424     MachineOperand &DefOp = PI.getOperand(0);
425     assert(DefOp.getSubReg() == 0);
426     auto *RC = MRI.getRegClass(DefOp.getReg());
427 
428     for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
429       MachineOperand &RegOp = PI.getOperand(i);
430       if (RegOp.getSubReg() == 0)
431         continue;
432 
433       // If the operand uses a subregister, replace it with a new register
434       // without subregisters, and generate a copy to the new register.
435       Register NewReg = MRI.createVirtualRegister(RC);
436       MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
437       MachineBasicBlock::iterator At = PredB.getFirstTerminator();
438       const DebugLoc &DL = PredB.findDebugLoc(At);
439       auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
440                     .addReg(RegOp.getReg(), getRegState(RegOp),
441                             RegOp.getSubReg());
442       Slots.insertMachineInstrInMaps(*Copy);
443       RegOp.setReg(NewReg);
444       RegOp.setSubReg(0);
445     }
446   }
447 }
448 
449 /// The SMS algorithm consists of the following main steps:
450 /// 1. Computation and analysis of the dependence graph.
451 /// 2. Ordering of the nodes (instructions).
452 /// 3. Attempt to Schedule the loop.
453 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
454   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
455 
456   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
457                         II_setByPragma, LI.LoopPipelinerInfo.get());
458 
459   MachineBasicBlock *MBB = L.getHeader();
460   // The kernel should not include any terminator instructions.  These
461   // will be added back later.
462   SMS.startBlock(MBB);
463 
464   // Compute the number of 'real' instructions in the basic block by
465   // ignoring terminators.
466   unsigned size = MBB->size();
467   for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
468                                    E = MBB->instr_end();
469        I != E; ++I, --size)
470     ;
471 
472   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
473   SMS.schedule();
474   SMS.exitRegion();
475 
476   SMS.finishBlock();
477   return SMS.hasNewSchedule();
478 }
479 
480 void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
481   AU.addRequired<AAResultsWrapperPass>();
482   AU.addPreserved<AAResultsWrapperPass>();
483   AU.addRequired<MachineLoopInfo>();
484   AU.addRequired<MachineDominatorTree>();
485   AU.addRequired<LiveIntervals>();
486   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
487   MachineFunctionPass::getAnalysisUsage(AU);
488 }
489 
490 void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
491   if (SwpForceII > 0)
492     MII = SwpForceII;
493   else if (II_setByPragma > 0)
494     MII = II_setByPragma;
495   else
496     MII = std::max(ResMII, RecMII);
497 }
498 
499 void SwingSchedulerDAG::setMAX_II() {
500   if (SwpForceII > 0)
501     MAX_II = SwpForceII;
502   else if (II_setByPragma > 0)
503     MAX_II = II_setByPragma;
504   else
505     MAX_II = MII + SwpIISearchRange;
506 }
507 
508 /// We override the schedule function in ScheduleDAGInstrs to implement the
509 /// scheduling part of the Swing Modulo Scheduling algorithm.
510 void SwingSchedulerDAG::schedule() {
511   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
512   buildSchedGraph(AA);
513   addLoopCarriedDependences(AA);
514   updatePhiDependences();
515   Topo.InitDAGTopologicalSorting();
516   changeDependences();
517   postProcessDAG();
518   LLVM_DEBUG(dump());
519 
520   NodeSetType NodeSets;
521   findCircuits(NodeSets);
522   NodeSetType Circuits = NodeSets;
523 
524   // Calculate the MII.
525   unsigned ResMII = calculateResMII();
526   unsigned RecMII = calculateRecMII(NodeSets);
527 
528   fuseRecs(NodeSets);
529 
530   // This flag is used for testing and can cause correctness problems.
531   if (SwpIgnoreRecMII)
532     RecMII = 0;
533 
534   setMII(ResMII, RecMII);
535   setMAX_II();
536 
537   LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
538                     << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
539 
540   // Can't schedule a loop without a valid MII.
541   if (MII == 0) {
542     LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
543     NumFailZeroMII++;
544     Pass.ORE->emit([&]() {
545       return MachineOptimizationRemarkAnalysis(
546                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
547              << "Invalid Minimal Initiation Interval: 0";
548     });
549     return;
550   }
551 
552   // Don't pipeline large loops.
553   if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
554     LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
555                       << ", we don't pipeline large loops\n");
556     NumFailLargeMaxMII++;
557     Pass.ORE->emit([&]() {
558       return MachineOptimizationRemarkAnalysis(
559                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
560              << "Minimal Initiation Interval too large: "
561              << ore::NV("MII", (int)MII) << " > "
562              << ore::NV("SwpMaxMii", SwpMaxMii) << "."
563              << "Refer to -pipeliner-max-mii.";
564     });
565     return;
566   }
567 
568   computeNodeFunctions(NodeSets);
569 
570   registerPressureFilter(NodeSets);
571 
572   colocateNodeSets(NodeSets);
573 
574   checkNodeSets(NodeSets);
575 
576   LLVM_DEBUG({
577     for (auto &I : NodeSets) {
578       dbgs() << "  Rec NodeSet ";
579       I.dump();
580     }
581   });
582 
583   llvm::stable_sort(NodeSets, std::greater<NodeSet>());
584 
585   groupRemainingNodes(NodeSets);
586 
587   removeDuplicateNodes(NodeSets);
588 
589   LLVM_DEBUG({
590     for (auto &I : NodeSets) {
591       dbgs() << "  NodeSet ";
592       I.dump();
593     }
594   });
595 
596   computeNodeOrder(NodeSets);
597 
598   // check for node order issues
599   checkValidNodeOrder(Circuits);
600 
601   SMSchedule Schedule(Pass.MF, this);
602   Scheduled = schedulePipeline(Schedule);
603 
604   if (!Scheduled){
605     LLVM_DEBUG(dbgs() << "No schedule found, return\n");
606     NumFailNoSchedule++;
607     Pass.ORE->emit([&]() {
608       return MachineOptimizationRemarkAnalysis(
609                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
610              << "Unable to find schedule";
611     });
612     return;
613   }
614 
615   unsigned numStages = Schedule.getMaxStageCount();
616   // No need to generate pipeline if there are no overlapped iterations.
617   if (numStages == 0) {
618     LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
619     NumFailZeroStage++;
620     Pass.ORE->emit([&]() {
621       return MachineOptimizationRemarkAnalysis(
622                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
623              << "No need to pipeline - no overlapped iterations in schedule.";
624     });
625     return;
626   }
627   // Check that the maximum stage count is less than user-defined limit.
628   if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
629     LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
630                       << " : too many stages, abort\n");
631     NumFailLargeMaxStage++;
632     Pass.ORE->emit([&]() {
633       return MachineOptimizationRemarkAnalysis(
634                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
635              << "Too many stages in schedule: "
636              << ore::NV("numStages", (int)numStages) << " > "
637              << ore::NV("SwpMaxStages", SwpMaxStages)
638              << ". Refer to -pipeliner-max-stages.";
639     });
640     return;
641   }
642 
643   Pass.ORE->emit([&]() {
644     return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
645                                      Loop.getHeader())
646            << "Pipelined succesfully!";
647   });
648 
649   // Generate the schedule as a ModuloSchedule.
650   DenseMap<MachineInstr *, int> Cycles, Stages;
651   std::vector<MachineInstr *> OrderedInsts;
652   for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
653        ++Cycle) {
654     for (SUnit *SU : Schedule.getInstructions(Cycle)) {
655       OrderedInsts.push_back(SU->getInstr());
656       Cycles[SU->getInstr()] = Cycle;
657       Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
658     }
659   }
660   DenseMap<MachineInstr *, std::pair<unsigned, int64_t>> NewInstrChanges;
661   for (auto &KV : NewMIs) {
662     Cycles[KV.first] = Cycles[KV.second];
663     Stages[KV.first] = Stages[KV.second];
664     NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
665   }
666 
667   ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
668                     std::move(Stages));
669   if (EmitTestAnnotations) {
670     assert(NewInstrChanges.empty() &&
671            "Cannot serialize a schedule with InstrChanges!");
672     ModuloScheduleTestAnnotater MSTI(MF, MS);
673     MSTI.annotate();
674     return;
675   }
676   // The experimental code generator can't work if there are InstChanges.
677   if (ExperimentalCodeGen && NewInstrChanges.empty()) {
678     PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
679     MSE.expand();
680   } else {
681     ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
682     MSE.expand();
683     MSE.cleanup();
684   }
685   ++NumPipelined;
686 }
687 
688 /// Clean up after the software pipeliner runs.
689 void SwingSchedulerDAG::finishBlock() {
690   for (auto &KV : NewMIs)
691     MF.deleteMachineInstr(KV.second);
692   NewMIs.clear();
693 
694   // Call the superclass.
695   ScheduleDAGInstrs::finishBlock();
696 }
697 
698 /// Return the register values for  the operands of a Phi instruction.
699 /// This function assume the instruction is a Phi.
700 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
701                        unsigned &InitVal, unsigned &LoopVal) {
702   assert(Phi.isPHI() && "Expecting a Phi.");
703 
704   InitVal = 0;
705   LoopVal = 0;
706   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
707     if (Phi.getOperand(i + 1).getMBB() != Loop)
708       InitVal = Phi.getOperand(i).getReg();
709     else
710       LoopVal = Phi.getOperand(i).getReg();
711 
712   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
713 }
714 
715 /// Return the Phi register value that comes the loop block.
716 static unsigned getLoopPhiReg(const MachineInstr &Phi,
717                               const MachineBasicBlock *LoopBB) {
718   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
719     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
720       return Phi.getOperand(i).getReg();
721   return 0;
722 }
723 
724 /// Return true if SUb can be reached from SUa following the chain edges.
725 static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
726   SmallPtrSet<SUnit *, 8> Visited;
727   SmallVector<SUnit *, 8> Worklist;
728   Worklist.push_back(SUa);
729   while (!Worklist.empty()) {
730     const SUnit *SU = Worklist.pop_back_val();
731     for (const auto &SI : SU->Succs) {
732       SUnit *SuccSU = SI.getSUnit();
733       if (SI.getKind() == SDep::Order) {
734         if (Visited.count(SuccSU))
735           continue;
736         if (SuccSU == SUb)
737           return true;
738         Worklist.push_back(SuccSU);
739         Visited.insert(SuccSU);
740       }
741     }
742   }
743   return false;
744 }
745 
746 /// Return true if the instruction causes a chain between memory
747 /// references before and after it.
748 static bool isDependenceBarrier(MachineInstr &MI) {
749   return MI.isCall() || MI.mayRaiseFPException() ||
750          MI.hasUnmodeledSideEffects() ||
751          (MI.hasOrderedMemoryRef() &&
752           (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
753 }
754 
755 /// Return the underlying objects for the memory references of an instruction.
756 /// This function calls the code in ValueTracking, but first checks that the
757 /// instruction has a memory operand.
758 static void getUnderlyingObjects(const MachineInstr *MI,
759                                  SmallVectorImpl<const Value *> &Objs) {
760   if (!MI->hasOneMemOperand())
761     return;
762   MachineMemOperand *MM = *MI->memoperands_begin();
763   if (!MM->getValue())
764     return;
765   getUnderlyingObjects(MM->getValue(), Objs);
766   for (const Value *V : Objs) {
767     if (!isIdentifiedObject(V)) {
768       Objs.clear();
769       return;
770     }
771   }
772 }
773 
774 /// Add a chain edge between a load and store if the store can be an
775 /// alias of the load on a subsequent iteration, i.e., a loop carried
776 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
777 /// but that code doesn't create loop carried dependences.
778 void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
779   MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
780   Value *UnknownValue =
781     UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
782   for (auto &SU : SUnits) {
783     MachineInstr &MI = *SU.getInstr();
784     if (isDependenceBarrier(MI))
785       PendingLoads.clear();
786     else if (MI.mayLoad()) {
787       SmallVector<const Value *, 4> Objs;
788       ::getUnderlyingObjects(&MI, Objs);
789       if (Objs.empty())
790         Objs.push_back(UnknownValue);
791       for (const auto *V : Objs) {
792         SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
793         SUs.push_back(&SU);
794       }
795     } else if (MI.mayStore()) {
796       SmallVector<const Value *, 4> Objs;
797       ::getUnderlyingObjects(&MI, Objs);
798       if (Objs.empty())
799         Objs.push_back(UnknownValue);
800       for (const auto *V : Objs) {
801         MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
802             PendingLoads.find(V);
803         if (I == PendingLoads.end())
804           continue;
805         for (auto *Load : I->second) {
806           if (isSuccOrder(Load, &SU))
807             continue;
808           MachineInstr &LdMI = *Load->getInstr();
809           // First, perform the cheaper check that compares the base register.
810           // If they are the same and the load offset is less than the store
811           // offset, then mark the dependence as loop carried potentially.
812           const MachineOperand *BaseOp1, *BaseOp2;
813           int64_t Offset1, Offset2;
814           bool Offset1IsScalable, Offset2IsScalable;
815           if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
816                                            Offset1IsScalable, TRI) &&
817               TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
818                                            Offset2IsScalable, TRI)) {
819             if (BaseOp1->isIdenticalTo(*BaseOp2) &&
820                 Offset1IsScalable == Offset2IsScalable &&
821                 (int)Offset1 < (int)Offset2) {
822               assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
823                      "What happened to the chain edge?");
824               SDep Dep(Load, SDep::Barrier);
825               Dep.setLatency(1);
826               SU.addPred(Dep);
827               continue;
828             }
829           }
830           // Second, the more expensive check that uses alias analysis on the
831           // base registers. If they alias, and the load offset is less than
832           // the store offset, the mark the dependence as loop carried.
833           if (!AA) {
834             SDep Dep(Load, SDep::Barrier);
835             Dep.setLatency(1);
836             SU.addPred(Dep);
837             continue;
838           }
839           MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
840           MachineMemOperand *MMO2 = *MI.memoperands_begin();
841           if (!MMO1->getValue() || !MMO2->getValue()) {
842             SDep Dep(Load, SDep::Barrier);
843             Dep.setLatency(1);
844             SU.addPred(Dep);
845             continue;
846           }
847           if (MMO1->getValue() == MMO2->getValue() &&
848               MMO1->getOffset() <= MMO2->getOffset()) {
849             SDep Dep(Load, SDep::Barrier);
850             Dep.setLatency(1);
851             SU.addPred(Dep);
852             continue;
853           }
854           if (!AA->isNoAlias(
855                   MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()),
856                   MemoryLocation::getAfter(MMO2->getValue(),
857                                            MMO2->getAAInfo()))) {
858             SDep Dep(Load, SDep::Barrier);
859             Dep.setLatency(1);
860             SU.addPred(Dep);
861           }
862         }
863       }
864     }
865   }
866 }
867 
868 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
869 /// processes dependences for PHIs. This function adds true dependences
870 /// from a PHI to a use, and a loop carried dependence from the use to the
871 /// PHI. The loop carried dependence is represented as an anti dependence
872 /// edge. This function also removes chain dependences between unrelated
873 /// PHIs.
874 void SwingSchedulerDAG::updatePhiDependences() {
875   SmallVector<SDep, 4> RemoveDeps;
876   const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
877 
878   // Iterate over each DAG node.
879   for (SUnit &I : SUnits) {
880     RemoveDeps.clear();
881     // Set to true if the instruction has an operand defined by a Phi.
882     unsigned HasPhiUse = 0;
883     unsigned HasPhiDef = 0;
884     MachineInstr *MI = I.getInstr();
885     // Iterate over each operand, and we process the definitions.
886     for (const MachineOperand &MO : MI->operands()) {
887       if (!MO.isReg())
888         continue;
889       Register Reg = MO.getReg();
890       if (MO.isDef()) {
891         // If the register is used by a Phi, then create an anti dependence.
892         for (MachineRegisterInfo::use_instr_iterator
893                  UI = MRI.use_instr_begin(Reg),
894                  UE = MRI.use_instr_end();
895              UI != UE; ++UI) {
896           MachineInstr *UseMI = &*UI;
897           SUnit *SU = getSUnit(UseMI);
898           if (SU != nullptr && UseMI->isPHI()) {
899             if (!MI->isPHI()) {
900               SDep Dep(SU, SDep::Anti, Reg);
901               Dep.setLatency(1);
902               I.addPred(Dep);
903             } else {
904               HasPhiDef = Reg;
905               // Add a chain edge to a dependent Phi that isn't an existing
906               // predecessor.
907               if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
908                 I.addPred(SDep(SU, SDep::Barrier));
909             }
910           }
911         }
912       } else if (MO.isUse()) {
913         // If the register is defined by a Phi, then create a true dependence.
914         MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
915         if (DefMI == nullptr)
916           continue;
917         SUnit *SU = getSUnit(DefMI);
918         if (SU != nullptr && DefMI->isPHI()) {
919           if (!MI->isPHI()) {
920             SDep Dep(SU, SDep::Data, Reg);
921             Dep.setLatency(0);
922             ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
923                                      &SchedModel);
924             I.addPred(Dep);
925           } else {
926             HasPhiUse = Reg;
927             // Add a chain edge to a dependent Phi that isn't an existing
928             // predecessor.
929             if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
930               I.addPred(SDep(SU, SDep::Barrier));
931           }
932         }
933       }
934     }
935     // Remove order dependences from an unrelated Phi.
936     if (!SwpPruneDeps)
937       continue;
938     for (auto &PI : I.Preds) {
939       MachineInstr *PMI = PI.getSUnit()->getInstr();
940       if (PMI->isPHI() && PI.getKind() == SDep::Order) {
941         if (I.getInstr()->isPHI()) {
942           if (PMI->getOperand(0).getReg() == HasPhiUse)
943             continue;
944           if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
945             continue;
946         }
947         RemoveDeps.push_back(PI);
948       }
949     }
950     for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
951       I.removePred(RemoveDeps[i]);
952   }
953 }
954 
955 /// Iterate over each DAG node and see if we can change any dependences
956 /// in order to reduce the recurrence MII.
957 void SwingSchedulerDAG::changeDependences() {
958   // See if an instruction can use a value from the previous iteration.
959   // If so, we update the base and offset of the instruction and change
960   // the dependences.
961   for (SUnit &I : SUnits) {
962     unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
963     int64_t NewOffset = 0;
964     if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
965                                NewOffset))
966       continue;
967 
968     // Get the MI and SUnit for the instruction that defines the original base.
969     Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
970     MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
971     if (!DefMI)
972       continue;
973     SUnit *DefSU = getSUnit(DefMI);
974     if (!DefSU)
975       continue;
976     // Get the MI and SUnit for the instruction that defins the new base.
977     MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
978     if (!LastMI)
979       continue;
980     SUnit *LastSU = getSUnit(LastMI);
981     if (!LastSU)
982       continue;
983 
984     if (Topo.IsReachable(&I, LastSU))
985       continue;
986 
987     // Remove the dependence. The value now depends on a prior iteration.
988     SmallVector<SDep, 4> Deps;
989     for (const SDep &P : I.Preds)
990       if (P.getSUnit() == DefSU)
991         Deps.push_back(P);
992     for (int i = 0, e = Deps.size(); i != e; i++) {
993       Topo.RemovePred(&I, Deps[i].getSUnit());
994       I.removePred(Deps[i]);
995     }
996     // Remove the chain dependence between the instructions.
997     Deps.clear();
998     for (auto &P : LastSU->Preds)
999       if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1000         Deps.push_back(P);
1001     for (int i = 0, e = Deps.size(); i != e; i++) {
1002       Topo.RemovePred(LastSU, Deps[i].getSUnit());
1003       LastSU->removePred(Deps[i]);
1004     }
1005 
1006     // Add a dependence between the new instruction and the instruction
1007     // that defines the new base.
1008     SDep Dep(&I, SDep::Anti, NewBase);
1009     Topo.AddPred(LastSU, &I);
1010     LastSU->addPred(Dep);
1011 
1012     // Remember the base and offset information so that we can update the
1013     // instruction during code generation.
1014     InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1015   }
1016 }
1017 
1018 /// Create an instruction stream that represents a single iteration and stage of
1019 /// each instruction. This function differs from SMSchedule::finalizeSchedule in
1020 /// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1021 /// function is an approximation of SMSchedule::finalizeSchedule with all
1022 /// non-const operations removed.
1023 static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
1024                                   SMSchedule &Schedule,
1025                                   std::vector<MachineInstr *> &OrderedInsts,
1026                                   DenseMap<MachineInstr *, unsigned> &Stages) {
1027   DenseMap<int, std::deque<SUnit *>> Instrs;
1028 
1029   // Move all instructions to the first stage from the later stages.
1030   for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1031        ++Cycle) {
1032     for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1033          Stage <= LastStage; ++Stage) {
1034       for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1035                Cycle + Stage * Schedule.getInitiationInterval()))) {
1036         Instrs[Cycle].push_front(SU);
1037       }
1038     }
1039   }
1040 
1041   for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1042        ++Cycle) {
1043     std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1044     CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1045     for (SUnit *SU : CycleInstrs) {
1046       MachineInstr *MI = SU->getInstr();
1047       OrderedInsts.push_back(MI);
1048       Stages[MI] = Schedule.stageScheduled(SU);
1049     }
1050   }
1051 }
1052 
1053 namespace {
1054 
1055 // FuncUnitSorter - Comparison operator used to sort instructions by
1056 // the number of functional unit choices.
1057 struct FuncUnitSorter {
1058   const InstrItineraryData *InstrItins;
1059   const MCSubtargetInfo *STI;
1060   DenseMap<InstrStage::FuncUnits, unsigned> Resources;
1061 
1062   FuncUnitSorter(const TargetSubtargetInfo &TSI)
1063       : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1064 
1065   // Compute the number of functional unit alternatives needed
1066   // at each stage, and take the minimum value. We prioritize the
1067   // instructions by the least number of choices first.
1068   unsigned minFuncUnits(const MachineInstr *Inst,
1069                         InstrStage::FuncUnits &F) const {
1070     unsigned SchedClass = Inst->getDesc().getSchedClass();
1071     unsigned min = UINT_MAX;
1072     if (InstrItins && !InstrItins->isEmpty()) {
1073       for (const InstrStage &IS :
1074            make_range(InstrItins->beginStage(SchedClass),
1075                       InstrItins->endStage(SchedClass))) {
1076         InstrStage::FuncUnits funcUnits = IS.getUnits();
1077         unsigned numAlternatives = llvm::popcount(funcUnits);
1078         if (numAlternatives < min) {
1079           min = numAlternatives;
1080           F = funcUnits;
1081         }
1082       }
1083       return min;
1084     }
1085     if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1086       const MCSchedClassDesc *SCDesc =
1087           STI->getSchedModel().getSchedClassDesc(SchedClass);
1088       if (!SCDesc->isValid())
1089         // No valid Schedule Class Desc for schedClass, should be
1090         // Pseudo/PostRAPseudo
1091         return min;
1092 
1093       for (const MCWriteProcResEntry &PRE :
1094            make_range(STI->getWriteProcResBegin(SCDesc),
1095                       STI->getWriteProcResEnd(SCDesc))) {
1096         if (!PRE.ReleaseAtCycle)
1097           continue;
1098         const MCProcResourceDesc *ProcResource =
1099             STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1100         unsigned NumUnits = ProcResource->NumUnits;
1101         if (NumUnits < min) {
1102           min = NumUnits;
1103           F = PRE.ProcResourceIdx;
1104         }
1105       }
1106       return min;
1107     }
1108     llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1109   }
1110 
1111   // Compute the critical resources needed by the instruction. This
1112   // function records the functional units needed by instructions that
1113   // must use only one functional unit. We use this as a tie breaker
1114   // for computing the resource MII. The instrutions that require
1115   // the same, highly used, functional unit have high priority.
1116   void calcCriticalResources(MachineInstr &MI) {
1117     unsigned SchedClass = MI.getDesc().getSchedClass();
1118     if (InstrItins && !InstrItins->isEmpty()) {
1119       for (const InstrStage &IS :
1120            make_range(InstrItins->beginStage(SchedClass),
1121                       InstrItins->endStage(SchedClass))) {
1122         InstrStage::FuncUnits FuncUnits = IS.getUnits();
1123         if (llvm::popcount(FuncUnits) == 1)
1124           Resources[FuncUnits]++;
1125       }
1126       return;
1127     }
1128     if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1129       const MCSchedClassDesc *SCDesc =
1130           STI->getSchedModel().getSchedClassDesc(SchedClass);
1131       if (!SCDesc->isValid())
1132         // No valid Schedule Class Desc for schedClass, should be
1133         // Pseudo/PostRAPseudo
1134         return;
1135 
1136       for (const MCWriteProcResEntry &PRE :
1137            make_range(STI->getWriteProcResBegin(SCDesc),
1138                       STI->getWriteProcResEnd(SCDesc))) {
1139         if (!PRE.ReleaseAtCycle)
1140           continue;
1141         Resources[PRE.ProcResourceIdx]++;
1142       }
1143       return;
1144     }
1145     llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1146   }
1147 
1148   /// Return true if IS1 has less priority than IS2.
1149   bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1150     InstrStage::FuncUnits F1 = 0, F2 = 0;
1151     unsigned MFUs1 = minFuncUnits(IS1, F1);
1152     unsigned MFUs2 = minFuncUnits(IS2, F2);
1153     if (MFUs1 == MFUs2)
1154       return Resources.lookup(F1) < Resources.lookup(F2);
1155     return MFUs1 > MFUs2;
1156   }
1157 };
1158 
1159 /// Calculate the maximum register pressure of the scheduled instructions stream
1160 class HighRegisterPressureDetector {
1161   MachineBasicBlock *OrigMBB;
1162   const MachineFunction &MF;
1163   const MachineRegisterInfo &MRI;
1164   const TargetRegisterInfo *TRI;
1165 
1166   const unsigned PSetNum;
1167 
1168   // Indexed by PSet ID
1169   // InitSetPressure takes into account the register pressure of live-in
1170   // registers. It's not depend on how the loop is scheduled, so it's enough to
1171   // calculate them once at the beginning.
1172   std::vector<unsigned> InitSetPressure;
1173 
1174   // Indexed by PSet ID
1175   // Upper limit for each register pressure set
1176   std::vector<unsigned> PressureSetLimit;
1177 
1178   DenseMap<MachineInstr *, RegisterOperands> ROMap;
1179 
1180   using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
1181 
1182 public:
1183   using OrderedInstsTy = std::vector<MachineInstr *>;
1184   using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1185 
1186 private:
1187   static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1188     if (Pressures.size() == 0) {
1189       dbgs() << "[]";
1190     } else {
1191       char Prefix = '[';
1192       for (unsigned P : Pressures) {
1193         dbgs() << Prefix << P;
1194         Prefix = ' ';
1195       }
1196       dbgs() << ']';
1197     }
1198   }
1199 
1200   void dumpPSet(Register Reg) const {
1201     dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1202     for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
1203          ++PSetIter) {
1204       dbgs() << *PSetIter << ' ';
1205     }
1206     dbgs() << '\n';
1207   }
1208 
1209   void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1210                                 Register Reg) const {
1211     auto PSetIter = MRI.getPressureSets(Reg);
1212     unsigned Weight = PSetIter.getWeight();
1213     for (; PSetIter.isValid(); ++PSetIter)
1214       Pressure[*PSetIter] += Weight;
1215   }
1216 
1217   void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1218                                 Register Reg) const {
1219     auto PSetIter = MRI.getPressureSets(Reg);
1220     unsigned Weight = PSetIter.getWeight();
1221     for (; PSetIter.isValid(); ++PSetIter) {
1222       auto &P = Pressure[*PSetIter];
1223       assert(P >= Weight &&
1224              "register pressure must be greater than or equal weight");
1225       P -= Weight;
1226     }
1227   }
1228 
1229   // Return true if Reg is fixed one, for example, stack pointer
1230   bool isFixedRegister(Register Reg) const {
1231     return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
1232   }
1233 
1234   bool isDefinedInThisLoop(Register Reg) const {
1235     return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1236   }
1237 
1238   // Search for live-in variables. They are factored into the register pressure
1239   // from the begining. Live-in variables used by every iteration should be
1240   // considered as alive throughout the loop. For example, the variable `c` in
1241   // following code. \code
1242   //   int c = ...;
1243   //   for (int i = 0; i < n; i++)
1244   //     a[i] += b[i] + c;
1245   // \endcode
1246   void computeLiveIn() {
1247     DenseSet<Register> Used;
1248     for (auto &MI : *OrigMBB) {
1249       if (MI.isDebugInstr())
1250         continue;
1251       for (auto &Use : ROMap[&MI].Uses) {
1252         auto Reg = Use.RegUnit;
1253         // Ignore the variable that appears only on one side of phi instruction
1254         // because it's used only at the first iteration.
1255         if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1256           continue;
1257         if (isFixedRegister(Reg))
1258           continue;
1259         if (isDefinedInThisLoop(Reg))
1260           continue;
1261         Used.insert(Reg);
1262       }
1263     }
1264 
1265     for (auto LiveIn : Used)
1266       increaseRegisterPressure(InitSetPressure, LiveIn);
1267   }
1268 
1269   // Calculate the upper limit of each pressure set
1270   void computePressureSetLimit(const RegisterClassInfo &RCI) {
1271     for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1272       PressureSetLimit[PSet] = TRI->getRegPressureSetLimit(MF, PSet);
1273 
1274     // We assume fixed registers, such as stack pointer, are already in use.
1275     // Therefore subtracting the weight of the fixed registers from the limit of
1276     // each pressure set in advance.
1277     SmallDenseSet<Register, 8> FixedRegs;
1278     for (const TargetRegisterClass *TRC : TRI->regclasses()) {
1279       for (const MCPhysReg Reg : *TRC)
1280         if (isFixedRegister(Reg))
1281           FixedRegs.insert(Reg);
1282     }
1283 
1284     LLVM_DEBUG({
1285       for (auto Reg : FixedRegs) {
1286         dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
1287         const int *Sets = TRI->getRegUnitPressureSets(Reg);
1288         for (; *Sets != -1; Sets++) {
1289           dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
1290         }
1291         dbgs() << "]\n";
1292       }
1293     });
1294 
1295     for (auto Reg : FixedRegs) {
1296       LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
1297                         << "\n");
1298       auto PSetIter = MRI.getPressureSets(Reg);
1299       unsigned Weight = PSetIter.getWeight();
1300       for (; PSetIter.isValid(); ++PSetIter) {
1301         unsigned &Limit = PressureSetLimit[*PSetIter];
1302         assert(Limit >= Weight &&
1303                "register pressure limit must be greater than or equal weight");
1304         Limit -= Weight;
1305         LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
1306                           << " (decreased by " << Weight << ")\n");
1307       }
1308     }
1309   }
1310 
1311   // There are two patterns of last-use.
1312   //   - by an instruction of the current iteration
1313   //   - by a phi instruction of the next iteration (loop carried value)
1314   //
1315   // Furthermore, following two groups of instructions are executed
1316   // simultaneously
1317   //   - next iteration's phi instructions in i-th stage
1318   //   - current iteration's instructions in i+1-th stage
1319   //
1320   // This function calculates the last-use of each register while taking into
1321   // account the above two patterns.
1322   Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1323                                    Instr2StageTy &Stages) const {
1324     // We treat virtual registers that are defined and used in this loop.
1325     // Following virtual register will be ignored
1326     //   - live-in one
1327     //   - defined but not used in the loop (potentially live-out)
1328     DenseSet<Register> TargetRegs;
1329     const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1330       if (isDefinedInThisLoop(Reg))
1331         TargetRegs.insert(Reg);
1332     };
1333     for (MachineInstr *MI : OrderedInsts) {
1334       if (MI->isPHI()) {
1335         Register Reg = getLoopPhiReg(*MI, OrigMBB);
1336         UpdateTargetRegs(Reg);
1337       } else {
1338         for (auto &Use : ROMap.find(MI)->getSecond().Uses)
1339           UpdateTargetRegs(Use.RegUnit);
1340       }
1341     }
1342 
1343     const auto InstrScore = [&Stages](MachineInstr *MI) {
1344       return Stages[MI] + MI->isPHI();
1345     };
1346 
1347     DenseMap<Register, MachineInstr *> LastUseMI;
1348     for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1349       for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1350         auto Reg = Use.RegUnit;
1351         if (!TargetRegs.contains(Reg))
1352           continue;
1353         auto Ite = LastUseMI.find(Reg);
1354         if (Ite == LastUseMI.end()) {
1355           LastUseMI[Reg] = MI;
1356         } else {
1357           MachineInstr *Orig = Ite->second;
1358           MachineInstr *New = MI;
1359           if (InstrScore(Orig) < InstrScore(New))
1360             LastUseMI[Reg] = New;
1361         }
1362       }
1363     }
1364 
1365     Instr2LastUsesTy LastUses;
1366     for (auto &Entry : LastUseMI)
1367       LastUses[Entry.second].insert(Entry.first);
1368     return LastUses;
1369   }
1370 
1371   // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1372   // iterations and check the register pressure at the point where all stages
1373   // overlapping.
1374   //
1375   // An example of unrolled loop where #Stage is 4..
1376   // Iter   i+0 i+1 i+2 i+3
1377   // ------------------------
1378   // Stage   0
1379   // Stage   1   0
1380   // Stage   2   1   0
1381   // Stage   3   2   1   0  <- All stages overlap
1382   //
1383   std::vector<unsigned>
1384   computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1385                         Instr2StageTy &Stages,
1386                         const unsigned StageCount) const {
1387     using RegSetTy = SmallDenseSet<Register, 16>;
1388 
1389     // Indexed by #Iter. To treat "local" variables of each stage separately, we
1390     // manage the liveness of the registers independently by iterations.
1391     SmallVector<RegSetTy> LiveRegSets(StageCount);
1392 
1393     auto CurSetPressure = InitSetPressure;
1394     auto MaxSetPressure = InitSetPressure;
1395     auto LastUses = computeLastUses(OrderedInsts, Stages);
1396 
1397     LLVM_DEBUG({
1398       dbgs() << "Ordered instructions:\n";
1399       for (MachineInstr *MI : OrderedInsts) {
1400         dbgs() << "Stage " << Stages[MI] << ": ";
1401         MI->dump();
1402       }
1403     });
1404 
1405     const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1406                                                    Register Reg) {
1407       if (!Reg.isValid() || isFixedRegister(Reg))
1408         return;
1409 
1410       bool Inserted = RegSet.insert(Reg).second;
1411       if (!Inserted)
1412         return;
1413 
1414       LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1415       increaseRegisterPressure(CurSetPressure, Reg);
1416       LLVM_DEBUG(dumpPSet(Reg));
1417     };
1418 
1419     const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1420                                                   Register Reg) {
1421       if (!Reg.isValid() || isFixedRegister(Reg))
1422         return;
1423 
1424       // live-in register
1425       if (!RegSet.contains(Reg))
1426         return;
1427 
1428       LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1429       RegSet.erase(Reg);
1430       decreaseRegisterPressure(CurSetPressure, Reg);
1431       LLVM_DEBUG(dumpPSet(Reg));
1432     };
1433 
1434     for (unsigned I = 0; I < StageCount; I++) {
1435       for (MachineInstr *MI : OrderedInsts) {
1436         const auto Stage = Stages[MI];
1437         if (I < Stage)
1438           continue;
1439 
1440         const unsigned Iter = I - Stage;
1441 
1442         for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1443           InsertReg(LiveRegSets[Iter], Def.RegUnit);
1444 
1445         for (auto LastUse : LastUses[MI]) {
1446           if (MI->isPHI()) {
1447             if (Iter != 0)
1448               EraseReg(LiveRegSets[Iter - 1], LastUse);
1449           } else {
1450             EraseReg(LiveRegSets[Iter], LastUse);
1451           }
1452         }
1453 
1454         for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1455           MaxSetPressure[PSet] =
1456               std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1457 
1458         LLVM_DEBUG({
1459           dbgs() << "CurSetPressure=";
1460           dumpRegisterPressures(CurSetPressure);
1461           dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1462           MI->dump();
1463         });
1464       }
1465     }
1466 
1467     return MaxSetPressure;
1468   }
1469 
1470 public:
1471   HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1472                                const MachineFunction &MF)
1473       : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
1474         TRI(MF.getSubtarget().getRegisterInfo()),
1475         PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1476         PressureSetLimit(PSetNum, 0) {}
1477 
1478   // Used to calculate register pressure, which is independent of loop
1479   // scheduling.
1480   void init(const RegisterClassInfo &RCI) {
1481     for (MachineInstr &MI : *OrigMBB) {
1482       if (MI.isDebugInstr())
1483         continue;
1484       ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1485     }
1486 
1487     computeLiveIn();
1488     computePressureSetLimit(RCI);
1489   }
1490 
1491   // Calculate the maximum register pressures of the loop and check if they
1492   // exceed the limit
1493   bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1494               const unsigned MaxStage) const {
1495     assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
1496            "the percentage of the margin must be between 0 to 100");
1497 
1498     OrderedInstsTy OrderedInsts;
1499     Instr2StageTy Stages;
1500     computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1501     const auto MaxSetPressure =
1502         computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1503 
1504     LLVM_DEBUG({
1505       dbgs() << "Dump MaxSetPressure:\n";
1506       for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1507         dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1508       }
1509       dbgs() << '\n';
1510     });
1511 
1512     for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1513       unsigned Limit = PressureSetLimit[PSet];
1514       unsigned Margin = Limit * RegPressureMargin / 100;
1515       LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1516                         << " Margin=" << Margin << "\n");
1517       if (Limit < MaxSetPressure[PSet] + Margin) {
1518         LLVM_DEBUG(
1519             dbgs()
1520             << "Rejected the schedule because of too high register pressure\n");
1521         return true;
1522       }
1523     }
1524     return false;
1525   }
1526 };
1527 
1528 } // end anonymous namespace
1529 
1530 /// Calculate the resource constrained minimum initiation interval for the
1531 /// specified loop. We use the DFA to model the resources needed for
1532 /// each instruction, and we ignore dependences. A different DFA is created
1533 /// for each cycle that is required. When adding a new instruction, we attempt
1534 /// to add it to each existing DFA, until a legal space is found. If the
1535 /// instruction cannot be reserved in an existing DFA, we create a new one.
1536 unsigned SwingSchedulerDAG::calculateResMII() {
1537   LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1538   ResourceManager RM(&MF.getSubtarget(), this);
1539   return RM.calculateResMII();
1540 }
1541 
1542 /// Calculate the recurrence-constrainted minimum initiation interval.
1543 /// Iterate over each circuit.  Compute the delay(c) and distance(c)
1544 /// for each circuit. The II needs to satisfy the inequality
1545 /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1546 /// II that satisfies the inequality, and the RecMII is the maximum
1547 /// of those values.
1548 unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1549   unsigned RecMII = 0;
1550 
1551   for (NodeSet &Nodes : NodeSets) {
1552     if (Nodes.empty())
1553       continue;
1554 
1555     unsigned Delay = Nodes.getLatency();
1556     unsigned Distance = 1;
1557 
1558     // ii = ceil(delay / distance)
1559     unsigned CurMII = (Delay + Distance - 1) / Distance;
1560     Nodes.setRecMII(CurMII);
1561     if (CurMII > RecMII)
1562       RecMII = CurMII;
1563   }
1564 
1565   return RecMII;
1566 }
1567 
1568 /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1569 /// but we do this to find the circuits, and then change them back.
1570 static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1571   SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1572   for (SUnit &SU : SUnits) {
1573     for (SDep &Pred : SU.Preds)
1574       if (Pred.getKind() == SDep::Anti)
1575         DepsAdded.push_back(std::make_pair(&SU, Pred));
1576   }
1577   for (std::pair<SUnit *, SDep> &P : DepsAdded) {
1578     // Remove this anti dependency and add one in the reverse direction.
1579     SUnit *SU = P.first;
1580     SDep &D = P.second;
1581     SUnit *TargetSU = D.getSUnit();
1582     unsigned Reg = D.getReg();
1583     unsigned Lat = D.getLatency();
1584     SU->removePred(D);
1585     SDep Dep(SU, SDep::Anti, Reg);
1586     Dep.setLatency(Lat);
1587     TargetSU->addPred(Dep);
1588   }
1589 }
1590 
1591 /// Create the adjacency structure of the nodes in the graph.
1592 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1593     SwingSchedulerDAG *DAG) {
1594   BitVector Added(SUnits.size());
1595   DenseMap<int, int> OutputDeps;
1596   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1597     Added.reset();
1598     // Add any successor to the adjacency matrix and exclude duplicates.
1599     for (auto &SI : SUnits[i].Succs) {
1600       // Only create a back-edge on the first and last nodes of a dependence
1601       // chain. This records any chains and adds them later.
1602       if (SI.getKind() == SDep::Output) {
1603         int N = SI.getSUnit()->NodeNum;
1604         int BackEdge = i;
1605         auto Dep = OutputDeps.find(BackEdge);
1606         if (Dep != OutputDeps.end()) {
1607           BackEdge = Dep->second;
1608           OutputDeps.erase(Dep);
1609         }
1610         OutputDeps[N] = BackEdge;
1611       }
1612       // Do not process a boundary node, an artificial node.
1613       // A back-edge is processed only if it goes to a Phi.
1614       if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1615           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1616         continue;
1617       int N = SI.getSUnit()->NodeNum;
1618       if (!Added.test(N)) {
1619         AdjK[i].push_back(N);
1620         Added.set(N);
1621       }
1622     }
1623     // A chain edge between a store and a load is treated as a back-edge in the
1624     // adjacency matrix.
1625     for (auto &PI : SUnits[i].Preds) {
1626       if (!SUnits[i].getInstr()->mayStore() ||
1627           !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1628         continue;
1629       if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1630         int N = PI.getSUnit()->NodeNum;
1631         if (!Added.test(N)) {
1632           AdjK[i].push_back(N);
1633           Added.set(N);
1634         }
1635       }
1636     }
1637   }
1638   // Add back-edges in the adjacency matrix for the output dependences.
1639   for (auto &OD : OutputDeps)
1640     if (!Added.test(OD.second)) {
1641       AdjK[OD.first].push_back(OD.second);
1642       Added.set(OD.second);
1643     }
1644 }
1645 
1646 /// Identify an elementary circuit in the dependence graph starting at the
1647 /// specified node.
1648 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1649                                           bool HasBackedge) {
1650   SUnit *SV = &SUnits[V];
1651   bool F = false;
1652   Stack.insert(SV);
1653   Blocked.set(V);
1654 
1655   for (auto W : AdjK[V]) {
1656     if (NumPaths > MaxPaths)
1657       break;
1658     if (W < S)
1659       continue;
1660     if (W == S) {
1661       if (!HasBackedge)
1662         NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1663       F = true;
1664       ++NumPaths;
1665       break;
1666     } else if (!Blocked.test(W)) {
1667       if (circuit(W, S, NodeSets,
1668                   Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1669         F = true;
1670     }
1671   }
1672 
1673   if (F)
1674     unblock(V);
1675   else {
1676     for (auto W : AdjK[V]) {
1677       if (W < S)
1678         continue;
1679       B[W].insert(SV);
1680     }
1681   }
1682   Stack.pop_back();
1683   return F;
1684 }
1685 
1686 /// Unblock a node in the circuit finding algorithm.
1687 void SwingSchedulerDAG::Circuits::unblock(int U) {
1688   Blocked.reset(U);
1689   SmallPtrSet<SUnit *, 4> &BU = B[U];
1690   while (!BU.empty()) {
1691     SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1692     assert(SI != BU.end() && "Invalid B set.");
1693     SUnit *W = *SI;
1694     BU.erase(W);
1695     if (Blocked.test(W->NodeNum))
1696       unblock(W->NodeNum);
1697   }
1698 }
1699 
1700 /// Identify all the elementary circuits in the dependence graph using
1701 /// Johnson's circuit algorithm.
1702 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1703   // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1704   // but we do this to find the circuits, and then change them back.
1705   swapAntiDependences(SUnits);
1706 
1707   Circuits Cir(SUnits, Topo);
1708   // Create the adjacency structure.
1709   Cir.createAdjacencyStructure(this);
1710   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1711     Cir.reset();
1712     Cir.circuit(i, i, NodeSets);
1713   }
1714 
1715   // Change the dependences back so that we've created a DAG again.
1716   swapAntiDependences(SUnits);
1717 }
1718 
1719 // Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1720 // is loop-carried to the USE in next iteration. This will help pipeliner avoid
1721 // additional copies that are needed across iterations. An artificial dependence
1722 // edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1723 
1724 // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1725 // SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1726 // PHI-------True-Dep------> USEOfPhi
1727 
1728 // The mutation creates
1729 // USEOfPHI -------Artificial-Dep---> SRCOfCopy
1730 
1731 // This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1732 // (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1733 // late  to avoid additional copies across iterations. The possible scheduling
1734 // order would be
1735 // USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
1736 
1737 void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1738   for (SUnit &SU : DAG->SUnits) {
1739     // Find the COPY/REG_SEQUENCE instruction.
1740     if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1741       continue;
1742 
1743     // Record the loop carried PHIs.
1744     SmallVector<SUnit *, 4> PHISUs;
1745     // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1746     SmallVector<SUnit *, 4> SrcSUs;
1747 
1748     for (auto &Dep : SU.Preds) {
1749       SUnit *TmpSU = Dep.getSUnit();
1750       MachineInstr *TmpMI = TmpSU->getInstr();
1751       SDep::Kind DepKind = Dep.getKind();
1752       // Save the loop carried PHI.
1753       if (DepKind == SDep::Anti && TmpMI->isPHI())
1754         PHISUs.push_back(TmpSU);
1755       // Save the source of COPY/REG_SEQUENCE.
1756       // If the source has no pre-decessors, we will end up creating cycles.
1757       else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1758         SrcSUs.push_back(TmpSU);
1759     }
1760 
1761     if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1762       continue;
1763 
1764     // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1765     // SUnit to the container.
1766     SmallVector<SUnit *, 8> UseSUs;
1767     // Do not use iterator based loop here as we are updating the container.
1768     for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1769       for (auto &Dep : PHISUs[Index]->Succs) {
1770         if (Dep.getKind() != SDep::Data)
1771           continue;
1772 
1773         SUnit *TmpSU = Dep.getSUnit();
1774         MachineInstr *TmpMI = TmpSU->getInstr();
1775         if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1776           PHISUs.push_back(TmpSU);
1777           continue;
1778         }
1779         UseSUs.push_back(TmpSU);
1780       }
1781     }
1782 
1783     if (UseSUs.size() == 0)
1784       continue;
1785 
1786     SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1787     // Add the artificial dependencies if it does not form a cycle.
1788     for (auto *I : UseSUs) {
1789       for (auto *Src : SrcSUs) {
1790         if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1791           Src->addPred(SDep(I, SDep::Artificial));
1792           SDAG->Topo.AddPred(Src, I);
1793         }
1794       }
1795     }
1796   }
1797 }
1798 
1799 /// Return true for DAG nodes that we ignore when computing the cost functions.
1800 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
1801 /// in the calculation of the ASAP, ALAP, etc functions.
1802 static bool ignoreDependence(const SDep &D, bool isPred) {
1803   if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1804     return true;
1805   return D.getKind() == SDep::Anti && isPred;
1806 }
1807 
1808 /// Compute several functions need to order the nodes for scheduling.
1809 ///  ASAP - Earliest time to schedule a node.
1810 ///  ALAP - Latest time to schedule a node.
1811 ///  MOV - Mobility function, difference between ALAP and ASAP.
1812 ///  D - Depth of each node.
1813 ///  H - Height of each node.
1814 void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1815   ScheduleInfo.resize(SUnits.size());
1816 
1817   LLVM_DEBUG({
1818     for (int I : Topo) {
1819       const SUnit &SU = SUnits[I];
1820       dumpNode(SU);
1821     }
1822   });
1823 
1824   int maxASAP = 0;
1825   // Compute ASAP and ZeroLatencyDepth.
1826   for (int I : Topo) {
1827     int asap = 0;
1828     int zeroLatencyDepth = 0;
1829     SUnit *SU = &SUnits[I];
1830     for (const SDep &P : SU->Preds) {
1831       SUnit *pred = P.getSUnit();
1832       if (P.getLatency() == 0)
1833         zeroLatencyDepth =
1834             std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1835       if (ignoreDependence(P, true))
1836         continue;
1837       asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
1838                                   getDistance(pred, SU, P) * MII));
1839     }
1840     maxASAP = std::max(maxASAP, asap);
1841     ScheduleInfo[I].ASAP = asap;
1842     ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
1843   }
1844 
1845   // Compute ALAP, ZeroLatencyHeight, and MOV.
1846   for (int I : llvm::reverse(Topo)) {
1847     int alap = maxASAP;
1848     int zeroLatencyHeight = 0;
1849     SUnit *SU = &SUnits[I];
1850     for (const SDep &S : SU->Succs) {
1851       SUnit *succ = S.getSUnit();
1852       if (succ->isBoundaryNode())
1853         continue;
1854       if (S.getLatency() == 0)
1855         zeroLatencyHeight =
1856             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1857       if (ignoreDependence(S, true))
1858         continue;
1859       alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
1860                                   getDistance(SU, succ, S) * MII));
1861     }
1862 
1863     ScheduleInfo[I].ALAP = alap;
1864     ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
1865   }
1866 
1867   // After computing the node functions, compute the summary for each node set.
1868   for (NodeSet &I : NodeSets)
1869     I.computeNodeSetInfo(this);
1870 
1871   LLVM_DEBUG({
1872     for (unsigned i = 0; i < SUnits.size(); i++) {
1873       dbgs() << "\tNode " << i << ":\n";
1874       dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
1875       dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
1876       dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
1877       dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
1878       dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
1879       dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1880       dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1881     }
1882   });
1883 }
1884 
1885 /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1886 /// as the predecessors of the elements of NodeOrder that are not also in
1887 /// NodeOrder.
1888 static bool pred_L(SetVector<SUnit *> &NodeOrder,
1889                    SmallSetVector<SUnit *, 8> &Preds,
1890                    const NodeSet *S = nullptr) {
1891   Preds.clear();
1892   for (const SUnit *SU : NodeOrder) {
1893     for (const SDep &Pred : SU->Preds) {
1894       if (S && S->count(Pred.getSUnit()) == 0)
1895         continue;
1896       if (ignoreDependence(Pred, true))
1897         continue;
1898       if (NodeOrder.count(Pred.getSUnit()) == 0)
1899         Preds.insert(Pred.getSUnit());
1900     }
1901     // Back-edges are predecessors with an anti-dependence.
1902     for (const SDep &Succ : SU->Succs) {
1903       if (Succ.getKind() != SDep::Anti)
1904         continue;
1905       if (S && S->count(Succ.getSUnit()) == 0)
1906         continue;
1907       if (NodeOrder.count(Succ.getSUnit()) == 0)
1908         Preds.insert(Succ.getSUnit());
1909     }
1910   }
1911   return !Preds.empty();
1912 }
1913 
1914 /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1915 /// as the successors of the elements of NodeOrder that are not also in
1916 /// NodeOrder.
1917 static bool succ_L(SetVector<SUnit *> &NodeOrder,
1918                    SmallSetVector<SUnit *, 8> &Succs,
1919                    const NodeSet *S = nullptr) {
1920   Succs.clear();
1921   for (const SUnit *SU : NodeOrder) {
1922     for (const SDep &Succ : SU->Succs) {
1923       if (S && S->count(Succ.getSUnit()) == 0)
1924         continue;
1925       if (ignoreDependence(Succ, false))
1926         continue;
1927       if (NodeOrder.count(Succ.getSUnit()) == 0)
1928         Succs.insert(Succ.getSUnit());
1929     }
1930     for (const SDep &Pred : SU->Preds) {
1931       if (Pred.getKind() != SDep::Anti)
1932         continue;
1933       if (S && S->count(Pred.getSUnit()) == 0)
1934         continue;
1935       if (NodeOrder.count(Pred.getSUnit()) == 0)
1936         Succs.insert(Pred.getSUnit());
1937     }
1938   }
1939   return !Succs.empty();
1940 }
1941 
1942 /// Return true if there is a path from the specified node to any of the nodes
1943 /// in DestNodes. Keep track and return the nodes in any path.
1944 static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1945                         SetVector<SUnit *> &DestNodes,
1946                         SetVector<SUnit *> &Exclude,
1947                         SmallPtrSet<SUnit *, 8> &Visited) {
1948   if (Cur->isBoundaryNode())
1949     return false;
1950   if (Exclude.contains(Cur))
1951     return false;
1952   if (DestNodes.contains(Cur))
1953     return true;
1954   if (!Visited.insert(Cur).second)
1955     return Path.contains(Cur);
1956   bool FoundPath = false;
1957   for (auto &SI : Cur->Succs)
1958     if (!ignoreDependence(SI, false))
1959       FoundPath |=
1960           computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1961   for (auto &PI : Cur->Preds)
1962     if (PI.getKind() == SDep::Anti)
1963       FoundPath |=
1964           computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1965   if (FoundPath)
1966     Path.insert(Cur);
1967   return FoundPath;
1968 }
1969 
1970 /// Compute the live-out registers for the instructions in a node-set.
1971 /// The live-out registers are those that are defined in the node-set,
1972 /// but not used. Except for use operands of Phis.
1973 static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1974                             NodeSet &NS) {
1975   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1976   MachineRegisterInfo &MRI = MF.getRegInfo();
1977   SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1978   SmallSet<unsigned, 4> Uses;
1979   for (SUnit *SU : NS) {
1980     const MachineInstr *MI = SU->getInstr();
1981     if (MI->isPHI())
1982       continue;
1983     for (const MachineOperand &MO : MI->all_uses()) {
1984       Register Reg = MO.getReg();
1985       if (Reg.isVirtual())
1986         Uses.insert(Reg);
1987       else if (MRI.isAllocatable(Reg))
1988         for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
1989           Uses.insert(Unit);
1990     }
1991   }
1992   for (SUnit *SU : NS)
1993     for (const MachineOperand &MO : SU->getInstr()->all_defs())
1994       if (!MO.isDead()) {
1995         Register Reg = MO.getReg();
1996         if (Reg.isVirtual()) {
1997           if (!Uses.count(Reg))
1998             LiveOutRegs.push_back(RegisterMaskPair(Reg,
1999                                                    LaneBitmask::getNone()));
2000         } else if (MRI.isAllocatable(Reg)) {
2001           for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2002             if (!Uses.count(Unit))
2003               LiveOutRegs.push_back(
2004                   RegisterMaskPair(Unit, LaneBitmask::getNone()));
2005         }
2006       }
2007   RPTracker.addLiveRegs(LiveOutRegs);
2008 }
2009 
2010 /// A heuristic to filter nodes in recurrent node-sets if the register
2011 /// pressure of a set is too high.
2012 void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2013   for (auto &NS : NodeSets) {
2014     // Skip small node-sets since they won't cause register pressure problems.
2015     if (NS.size() <= 2)
2016       continue;
2017     IntervalPressure RecRegPressure;
2018     RegPressureTracker RecRPTracker(RecRegPressure);
2019     RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2020     computeLiveOuts(MF, RecRPTracker, NS);
2021     RecRPTracker.closeBottom();
2022 
2023     std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2024     llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2025       return A->NodeNum > B->NodeNum;
2026     });
2027 
2028     for (auto &SU : SUnits) {
2029       // Since we're computing the register pressure for a subset of the
2030       // instructions in a block, we need to set the tracker for each
2031       // instruction in the node-set. The tracker is set to the instruction
2032       // just after the one we're interested in.
2033       MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
2034       RecRPTracker.setPos(std::next(CurInstI));
2035 
2036       RegPressureDelta RPDelta;
2037       ArrayRef<PressureChange> CriticalPSets;
2038       RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2039                                              CriticalPSets,
2040                                              RecRegPressure.MaxSetPressure);
2041       if (RPDelta.Excess.isValid()) {
2042         LLVM_DEBUG(
2043             dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2044                    << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2045                    << ":" << RPDelta.Excess.getUnitInc() << "\n");
2046         NS.setExceedPressure(SU);
2047         break;
2048       }
2049       RecRPTracker.recede();
2050     }
2051   }
2052 }
2053 
2054 /// A heuristic to colocate node sets that have the same set of
2055 /// successors.
2056 void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2057   unsigned Colocate = 0;
2058   for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2059     NodeSet &N1 = NodeSets[i];
2060     SmallSetVector<SUnit *, 8> S1;
2061     if (N1.empty() || !succ_L(N1, S1))
2062       continue;
2063     for (int j = i + 1; j < e; ++j) {
2064       NodeSet &N2 = NodeSets[j];
2065       if (N1.compareRecMII(N2) != 0)
2066         continue;
2067       SmallSetVector<SUnit *, 8> S2;
2068       if (N2.empty() || !succ_L(N2, S2))
2069         continue;
2070       if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2071         N1.setColocate(++Colocate);
2072         N2.setColocate(Colocate);
2073         break;
2074       }
2075     }
2076   }
2077 }
2078 
2079 /// Check if the existing node-sets are profitable. If not, then ignore the
2080 /// recurrent node-sets, and attempt to schedule all nodes together. This is
2081 /// a heuristic. If the MII is large and all the recurrent node-sets are small,
2082 /// then it's best to try to schedule all instructions together instead of
2083 /// starting with the recurrent node-sets.
2084 void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2085   // Look for loops with a large MII.
2086   if (MII < 17)
2087     return;
2088   // Check if the node-set contains only a simple add recurrence.
2089   for (auto &NS : NodeSets) {
2090     if (NS.getRecMII() > 2)
2091       return;
2092     if (NS.getMaxDepth() > MII)
2093       return;
2094   }
2095   NodeSets.clear();
2096   LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2097 }
2098 
2099 /// Add the nodes that do not belong to a recurrence set into groups
2100 /// based upon connected components.
2101 void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2102   SetVector<SUnit *> NodesAdded;
2103   SmallPtrSet<SUnit *, 8> Visited;
2104   // Add the nodes that are on a path between the previous node sets and
2105   // the current node set.
2106   for (NodeSet &I : NodeSets) {
2107     SmallSetVector<SUnit *, 8> N;
2108     // Add the nodes from the current node set to the previous node set.
2109     if (succ_L(I, N)) {
2110       SetVector<SUnit *> Path;
2111       for (SUnit *NI : N) {
2112         Visited.clear();
2113         computePath(NI, Path, NodesAdded, I, Visited);
2114       }
2115       if (!Path.empty())
2116         I.insert(Path.begin(), Path.end());
2117     }
2118     // Add the nodes from the previous node set to the current node set.
2119     N.clear();
2120     if (succ_L(NodesAdded, N)) {
2121       SetVector<SUnit *> Path;
2122       for (SUnit *NI : N) {
2123         Visited.clear();
2124         computePath(NI, Path, I, NodesAdded, Visited);
2125       }
2126       if (!Path.empty())
2127         I.insert(Path.begin(), Path.end());
2128     }
2129     NodesAdded.insert(I.begin(), I.end());
2130   }
2131 
2132   // Create a new node set with the connected nodes of any successor of a node
2133   // in a recurrent set.
2134   NodeSet NewSet;
2135   SmallSetVector<SUnit *, 8> N;
2136   if (succ_L(NodesAdded, N))
2137     for (SUnit *I : N)
2138       addConnectedNodes(I, NewSet, NodesAdded);
2139   if (!NewSet.empty())
2140     NodeSets.push_back(NewSet);
2141 
2142   // Create a new node set with the connected nodes of any predecessor of a node
2143   // in a recurrent set.
2144   NewSet.clear();
2145   if (pred_L(NodesAdded, N))
2146     for (SUnit *I : N)
2147       addConnectedNodes(I, NewSet, NodesAdded);
2148   if (!NewSet.empty())
2149     NodeSets.push_back(NewSet);
2150 
2151   // Create new nodes sets with the connected nodes any remaining node that
2152   // has no predecessor.
2153   for (SUnit &SU : SUnits) {
2154     if (NodesAdded.count(&SU) == 0) {
2155       NewSet.clear();
2156       addConnectedNodes(&SU, NewSet, NodesAdded);
2157       if (!NewSet.empty())
2158         NodeSets.push_back(NewSet);
2159     }
2160   }
2161 }
2162 
2163 /// Add the node to the set, and add all of its connected nodes to the set.
2164 void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2165                                           SetVector<SUnit *> &NodesAdded) {
2166   NewSet.insert(SU);
2167   NodesAdded.insert(SU);
2168   for (auto &SI : SU->Succs) {
2169     SUnit *Successor = SI.getSUnit();
2170     if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
2171         NodesAdded.count(Successor) == 0)
2172       addConnectedNodes(Successor, NewSet, NodesAdded);
2173   }
2174   for (auto &PI : SU->Preds) {
2175     SUnit *Predecessor = PI.getSUnit();
2176     if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2177       addConnectedNodes(Predecessor, NewSet, NodesAdded);
2178   }
2179 }
2180 
2181 /// Return true if Set1 contains elements in Set2. The elements in common
2182 /// are returned in a different container.
2183 static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2184                         SmallSetVector<SUnit *, 8> &Result) {
2185   Result.clear();
2186   for (SUnit *SU : Set1) {
2187     if (Set2.count(SU) != 0)
2188       Result.insert(SU);
2189   }
2190   return !Result.empty();
2191 }
2192 
2193 /// Merge the recurrence node sets that have the same initial node.
2194 void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2195   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2196        ++I) {
2197     NodeSet &NI = *I;
2198     for (NodeSetType::iterator J = I + 1; J != E;) {
2199       NodeSet &NJ = *J;
2200       if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2201         if (NJ.compareRecMII(NI) > 0)
2202           NI.setRecMII(NJ.getRecMII());
2203         for (SUnit *SU : *J)
2204           I->insert(SU);
2205         NodeSets.erase(J);
2206         E = NodeSets.end();
2207       } else {
2208         ++J;
2209       }
2210     }
2211   }
2212 }
2213 
2214 /// Remove nodes that have been scheduled in previous NodeSets.
2215 void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2216   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2217        ++I)
2218     for (NodeSetType::iterator J = I + 1; J != E;) {
2219       J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2220 
2221       if (J->empty()) {
2222         NodeSets.erase(J);
2223         E = NodeSets.end();
2224       } else {
2225         ++J;
2226       }
2227     }
2228 }
2229 
2230 /// Compute an ordered list of the dependence graph nodes, which
2231 /// indicates the order that the nodes will be scheduled.  This is a
2232 /// two-level algorithm. First, a partial order is created, which
2233 /// consists of a list of sets ordered from highest to lowest priority.
2234 void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2235   SmallSetVector<SUnit *, 8> R;
2236   NodeOrder.clear();
2237 
2238   for (auto &Nodes : NodeSets) {
2239     LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2240     OrderKind Order;
2241     SmallSetVector<SUnit *, 8> N;
2242     if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2243       R.insert(N.begin(), N.end());
2244       Order = BottomUp;
2245       LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
2246     } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2247       R.insert(N.begin(), N.end());
2248       Order = TopDown;
2249       LLVM_DEBUG(dbgs() << "  Top down (succs) ");
2250     } else if (isIntersect(N, Nodes, R)) {
2251       // If some of the successors are in the existing node-set, then use the
2252       // top-down ordering.
2253       Order = TopDown;
2254       LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
2255     } else if (NodeSets.size() == 1) {
2256       for (const auto &N : Nodes)
2257         if (N->Succs.size() == 0)
2258           R.insert(N);
2259       Order = BottomUp;
2260       LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
2261     } else {
2262       // Find the node with the highest ASAP.
2263       SUnit *maxASAP = nullptr;
2264       for (SUnit *SU : Nodes) {
2265         if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2266             (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2267           maxASAP = SU;
2268       }
2269       R.insert(maxASAP);
2270       Order = BottomUp;
2271       LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
2272     }
2273 
2274     while (!R.empty()) {
2275       if (Order == TopDown) {
2276         // Choose the node with the maximum height.  If more than one, choose
2277         // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2278         // choose the node with the lowest MOV.
2279         while (!R.empty()) {
2280           SUnit *maxHeight = nullptr;
2281           for (SUnit *I : R) {
2282             if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2283               maxHeight = I;
2284             else if (getHeight(I) == getHeight(maxHeight) &&
2285                      getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
2286               maxHeight = I;
2287             else if (getHeight(I) == getHeight(maxHeight) &&
2288                      getZeroLatencyHeight(I) ==
2289                          getZeroLatencyHeight(maxHeight) &&
2290                      getMOV(I) < getMOV(maxHeight))
2291               maxHeight = I;
2292           }
2293           NodeOrder.insert(maxHeight);
2294           LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2295           R.remove(maxHeight);
2296           for (const auto &I : maxHeight->Succs) {
2297             if (Nodes.count(I.getSUnit()) == 0)
2298               continue;
2299             if (NodeOrder.contains(I.getSUnit()))
2300               continue;
2301             if (ignoreDependence(I, false))
2302               continue;
2303             R.insert(I.getSUnit());
2304           }
2305           // Back-edges are predecessors with an anti-dependence.
2306           for (const auto &I : maxHeight->Preds) {
2307             if (I.getKind() != SDep::Anti)
2308               continue;
2309             if (Nodes.count(I.getSUnit()) == 0)
2310               continue;
2311             if (NodeOrder.contains(I.getSUnit()))
2312               continue;
2313             R.insert(I.getSUnit());
2314           }
2315         }
2316         Order = BottomUp;
2317         LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
2318         SmallSetVector<SUnit *, 8> N;
2319         if (pred_L(NodeOrder, N, &Nodes))
2320           R.insert(N.begin(), N.end());
2321       } else {
2322         // Choose the node with the maximum depth.  If more than one, choose
2323         // the node with the maximum ZeroLatencyDepth. If still more than one,
2324         // choose the node with the lowest MOV.
2325         while (!R.empty()) {
2326           SUnit *maxDepth = nullptr;
2327           for (SUnit *I : R) {
2328             if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2329               maxDepth = I;
2330             else if (getDepth(I) == getDepth(maxDepth) &&
2331                      getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
2332               maxDepth = I;
2333             else if (getDepth(I) == getDepth(maxDepth) &&
2334                      getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
2335                      getMOV(I) < getMOV(maxDepth))
2336               maxDepth = I;
2337           }
2338           NodeOrder.insert(maxDepth);
2339           LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2340           R.remove(maxDepth);
2341           if (Nodes.isExceedSU(maxDepth)) {
2342             Order = TopDown;
2343             R.clear();
2344             R.insert(Nodes.getNode(0));
2345             break;
2346           }
2347           for (const auto &I : maxDepth->Preds) {
2348             if (Nodes.count(I.getSUnit()) == 0)
2349               continue;
2350             if (NodeOrder.contains(I.getSUnit()))
2351               continue;
2352             R.insert(I.getSUnit());
2353           }
2354           // Back-edges are predecessors with an anti-dependence.
2355           for (const auto &I : maxDepth->Succs) {
2356             if (I.getKind() != SDep::Anti)
2357               continue;
2358             if (Nodes.count(I.getSUnit()) == 0)
2359               continue;
2360             if (NodeOrder.contains(I.getSUnit()))
2361               continue;
2362             R.insert(I.getSUnit());
2363           }
2364         }
2365         Order = TopDown;
2366         LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
2367         SmallSetVector<SUnit *, 8> N;
2368         if (succ_L(NodeOrder, N, &Nodes))
2369           R.insert(N.begin(), N.end());
2370       }
2371     }
2372     LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2373   }
2374 
2375   LLVM_DEBUG({
2376     dbgs() << "Node order: ";
2377     for (SUnit *I : NodeOrder)
2378       dbgs() << " " << I->NodeNum << " ";
2379     dbgs() << "\n";
2380   });
2381 }
2382 
2383 /// Process the nodes in the computed order and create the pipelined schedule
2384 /// of the instructions, if possible. Return true if a schedule is found.
2385 bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2386 
2387   if (NodeOrder.empty()){
2388     LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2389     return false;
2390   }
2391 
2392   bool scheduleFound = false;
2393   std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2394   if (LimitRegPressure) {
2395     HRPDetector =
2396         std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2397     HRPDetector->init(RegClassInfo);
2398   }
2399   // Keep increasing II until a valid schedule is found.
2400   for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2401     Schedule.reset();
2402     Schedule.setInitiationInterval(II);
2403     LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2404 
2405     SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2406     SetVector<SUnit *>::iterator NE = NodeOrder.end();
2407     do {
2408       SUnit *SU = *NI;
2409 
2410       // Compute the schedule time for the instruction, which is based
2411       // upon the scheduled time for any predecessors/successors.
2412       int EarlyStart = INT_MIN;
2413       int LateStart = INT_MAX;
2414       // These values are set when the size of the schedule window is limited
2415       // due to chain dependences.
2416       int SchedEnd = INT_MAX;
2417       int SchedStart = INT_MIN;
2418       Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2419                             II, this);
2420       LLVM_DEBUG({
2421         dbgs() << "\n";
2422         dbgs() << "Inst (" << SU->NodeNum << ") ";
2423         SU->getInstr()->dump();
2424         dbgs() << "\n";
2425       });
2426       LLVM_DEBUG({
2427         dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
2428                          LateStart, SchedEnd, SchedStart);
2429       });
2430 
2431       if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2432           SchedStart > LateStart)
2433         scheduleFound = false;
2434       else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2435         SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2436         scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2437       } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2438         SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2439         scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2440       } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2441         SchedEnd =
2442             std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2443         // When scheduling a Phi it is better to start at the late cycle and go
2444         // backwards. The default order may insert the Phi too far away from
2445         // its first dependence.
2446         if (SU->getInstr()->isPHI())
2447           scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2448         else
2449           scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2450       } else {
2451         int FirstCycle = Schedule.getFirstCycle();
2452         scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2453                                         FirstCycle + getASAP(SU) + II - 1, II);
2454       }
2455       // Even if we find a schedule, make sure the schedule doesn't exceed the
2456       // allowable number of stages. We keep trying if this happens.
2457       if (scheduleFound)
2458         if (SwpMaxStages > -1 &&
2459             Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2460           scheduleFound = false;
2461 
2462       LLVM_DEBUG({
2463         if (!scheduleFound)
2464           dbgs() << "\tCan't schedule\n";
2465       });
2466     } while (++NI != NE && scheduleFound);
2467 
2468     // If a schedule is found, ensure non-pipelined instructions are in stage 0
2469     if (scheduleFound)
2470       scheduleFound =
2471           Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2472 
2473     // If a schedule is found, check if it is a valid schedule too.
2474     if (scheduleFound)
2475       scheduleFound = Schedule.isValidSchedule(this);
2476 
2477     // If a schedule was found and the option is enabled, check if the schedule
2478     // might generate additional register spills/fills.
2479     if (scheduleFound && LimitRegPressure)
2480       scheduleFound =
2481           !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2482   }
2483 
2484   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2485                     << " (II=" << Schedule.getInitiationInterval()
2486                     << ")\n");
2487 
2488   if (scheduleFound) {
2489     scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2490     if (!scheduleFound)
2491       LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2492   }
2493 
2494   if (scheduleFound) {
2495     Schedule.finalizeSchedule(this);
2496     Pass.ORE->emit([&]() {
2497       return MachineOptimizationRemarkAnalysis(
2498                  DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2499              << "Schedule found with Initiation Interval: "
2500              << ore::NV("II", Schedule.getInitiationInterval())
2501              << ", MaxStageCount: "
2502              << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2503     });
2504   } else
2505     Schedule.reset();
2506 
2507   return scheduleFound && Schedule.getMaxStageCount() > 0;
2508 }
2509 
2510 /// Return true if we can compute the amount the instruction changes
2511 /// during each iteration. Set Delta to the amount of the change.
2512 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2513   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2514   const MachineOperand *BaseOp;
2515   int64_t Offset;
2516   bool OffsetIsScalable;
2517   if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2518     return false;
2519 
2520   // FIXME: This algorithm assumes instructions have fixed-size offsets.
2521   if (OffsetIsScalable)
2522     return false;
2523 
2524   if (!BaseOp->isReg())
2525     return false;
2526 
2527   Register BaseReg = BaseOp->getReg();
2528 
2529   MachineRegisterInfo &MRI = MF.getRegInfo();
2530   // Check if there is a Phi. If so, get the definition in the loop.
2531   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2532   if (BaseDef && BaseDef->isPHI()) {
2533     BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2534     BaseDef = MRI.getVRegDef(BaseReg);
2535   }
2536   if (!BaseDef)
2537     return false;
2538 
2539   int D = 0;
2540   if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2541     return false;
2542 
2543   Delta = D;
2544   return true;
2545 }
2546 
2547 /// Check if we can change the instruction to use an offset value from the
2548 /// previous iteration. If so, return true and set the base and offset values
2549 /// so that we can rewrite the load, if necessary.
2550 ///   v1 = Phi(v0, v3)
2551 ///   v2 = load v1, 0
2552 ///   v3 = post_store v1, 4, x
2553 /// This function enables the load to be rewritten as v2 = load v3, 4.
2554 bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2555                                               unsigned &BasePos,
2556                                               unsigned &OffsetPos,
2557                                               unsigned &NewBase,
2558                                               int64_t &Offset) {
2559   // Get the load instruction.
2560   if (TII->isPostIncrement(*MI))
2561     return false;
2562   unsigned BasePosLd, OffsetPosLd;
2563   if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2564     return false;
2565   Register BaseReg = MI->getOperand(BasePosLd).getReg();
2566 
2567   // Look for the Phi instruction.
2568   MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2569   MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2570   if (!Phi || !Phi->isPHI())
2571     return false;
2572   // Get the register defined in the loop block.
2573   unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2574   if (!PrevReg)
2575     return false;
2576 
2577   // Check for the post-increment load/store instruction.
2578   MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2579   if (!PrevDef || PrevDef == MI)
2580     return false;
2581 
2582   if (!TII->isPostIncrement(*PrevDef))
2583     return false;
2584 
2585   unsigned BasePos1 = 0, OffsetPos1 = 0;
2586   if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2587     return false;
2588 
2589   // Make sure that the instructions do not access the same memory location in
2590   // the next iteration.
2591   int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2592   int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2593   MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2594   NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2595   bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2596   MF.deleteMachineInstr(NewMI);
2597   if (!Disjoint)
2598     return false;
2599 
2600   // Set the return value once we determine that we return true.
2601   BasePos = BasePosLd;
2602   OffsetPos = OffsetPosLd;
2603   NewBase = PrevReg;
2604   Offset = StoreOffset;
2605   return true;
2606 }
2607 
2608 /// Apply changes to the instruction if needed. The changes are need
2609 /// to improve the scheduling and depend up on the final schedule.
2610 void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
2611                                          SMSchedule &Schedule) {
2612   SUnit *SU = getSUnit(MI);
2613   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
2614       InstrChanges.find(SU);
2615   if (It != InstrChanges.end()) {
2616     std::pair<unsigned, int64_t> RegAndOffset = It->second;
2617     unsigned BasePos, OffsetPos;
2618     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2619       return;
2620     Register BaseReg = MI->getOperand(BasePos).getReg();
2621     MachineInstr *LoopDef = findDefInLoop(BaseReg);
2622     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2623     int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2624     int BaseStageNum = Schedule.stageScheduled(SU);
2625     int BaseCycleNum = Schedule.cycleScheduled(SU);
2626     if (BaseStageNum < DefStageNum) {
2627       MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2628       int OffsetDiff = DefStageNum - BaseStageNum;
2629       if (DefCycleNum < BaseCycleNum) {
2630         NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2631         if (OffsetDiff > 0)
2632           --OffsetDiff;
2633       }
2634       int64_t NewOffset =
2635           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2636       NewMI->getOperand(OffsetPos).setImm(NewOffset);
2637       SU->setInstr(NewMI);
2638       MISUnitMap[NewMI] = SU;
2639       NewMIs[MI] = NewMI;
2640     }
2641   }
2642 }
2643 
2644 /// Return the instruction in the loop that defines the register.
2645 /// If the definition is a Phi, then follow the Phi operand to
2646 /// the instruction in the loop.
2647 MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2648   SmallPtrSet<MachineInstr *, 8> Visited;
2649   MachineInstr *Def = MRI.getVRegDef(Reg);
2650   while (Def->isPHI()) {
2651     if (!Visited.insert(Def).second)
2652       break;
2653     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2654       if (Def->getOperand(i + 1).getMBB() == BB) {
2655         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2656         break;
2657       }
2658   }
2659   return Def;
2660 }
2661 
2662 /// Return true for an order or output dependence that is loop carried
2663 /// potentially. A dependence is loop carried if the destination defines a value
2664 /// that may be used or defined by the source in a subsequent iteration.
2665 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
2666                                          bool isSucc) {
2667   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2668       Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
2669     return false;
2670 
2671   if (!SwpPruneLoopCarried)
2672     return true;
2673 
2674   if (Dep.getKind() == SDep::Output)
2675     return true;
2676 
2677   MachineInstr *SI = Source->getInstr();
2678   MachineInstr *DI = Dep.getSUnit()->getInstr();
2679   if (!isSucc)
2680     std::swap(SI, DI);
2681   assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2682 
2683   // Assume ordered loads and stores may have a loop carried dependence.
2684   if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2685       SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2686       SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2687     return true;
2688 
2689   if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2690     return false;
2691 
2692   // The conservative assumption is that a dependence between memory operations
2693   // may be loop carried. The following code checks when it can be proved that
2694   // there is no loop carried dependence.
2695   unsigned DeltaS, DeltaD;
2696   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2697     return true;
2698 
2699   const MachineOperand *BaseOpS, *BaseOpD;
2700   int64_t OffsetS, OffsetD;
2701   bool OffsetSIsScalable, OffsetDIsScalable;
2702   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2703   if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2704                                     TRI) ||
2705       !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2706                                     TRI))
2707     return true;
2708 
2709   assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2710          "Expected offsets to be byte offsets");
2711 
2712   MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2713   MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2714   if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2715     return true;
2716 
2717   unsigned InitValS = 0;
2718   unsigned LoopValS = 0;
2719   unsigned InitValD = 0;
2720   unsigned LoopValD = 0;
2721   getPhiRegs(*DefS, BB, InitValS, LoopValS);
2722   getPhiRegs(*DefD, BB, InitValD, LoopValD);
2723   MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2724   MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2725 
2726   if (!InitDefS->isIdenticalTo(*InitDefD))
2727     return true;
2728 
2729   // Check that the base register is incremented by a constant value for each
2730   // iteration.
2731   MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2732   int D = 0;
2733   if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2734     return true;
2735 
2736   LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
2737   LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
2738 
2739   // This is the main test, which checks the offset values and the loop
2740   // increment value to determine if the accesses may be loop carried.
2741   if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
2742     return true;
2743 
2744   if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
2745       DeltaD < AccessSizeD.getValue())
2746     return true;
2747 
2748   return (OffsetS + (int64_t)AccessSizeS.getValue() <
2749           OffsetD + (int64_t)AccessSizeD.getValue());
2750 }
2751 
2752 void SwingSchedulerDAG::postProcessDAG() {
2753   for (auto &M : Mutations)
2754     M->apply(this);
2755 }
2756 
2757 /// Try to schedule the node at the specified StartCycle and continue
2758 /// until the node is schedule or the EndCycle is reached.  This function
2759 /// returns true if the node is scheduled.  This routine may search either
2760 /// forward or backward for a place to insert the instruction based upon
2761 /// the relative values of StartCycle and EndCycle.
2762 bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2763   bool forward = true;
2764   LLVM_DEBUG({
2765     dbgs() << "Trying to insert node between " << StartCycle << " and "
2766            << EndCycle << " II: " << II << "\n";
2767   });
2768   if (StartCycle > EndCycle)
2769     forward = false;
2770 
2771   // The terminating condition depends on the direction.
2772   int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2773   for (int curCycle = StartCycle; curCycle != termCycle;
2774        forward ? ++curCycle : --curCycle) {
2775 
2776     if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2777         ProcItinResources.canReserveResources(*SU, curCycle)) {
2778       LLVM_DEBUG({
2779         dbgs() << "\tinsert at cycle " << curCycle << " ";
2780         SU->getInstr()->dump();
2781       });
2782 
2783       if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2784         ProcItinResources.reserveResources(*SU, curCycle);
2785       ScheduledInstrs[curCycle].push_back(SU);
2786       InstrToCycle.insert(std::make_pair(SU, curCycle));
2787       if (curCycle > LastCycle)
2788         LastCycle = curCycle;
2789       if (curCycle < FirstCycle)
2790         FirstCycle = curCycle;
2791       return true;
2792     }
2793     LLVM_DEBUG({
2794       dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2795       SU->getInstr()->dump();
2796     });
2797   }
2798   return false;
2799 }
2800 
2801 // Return the cycle of the earliest scheduled instruction in the chain.
2802 int SMSchedule::earliestCycleInChain(const SDep &Dep) {
2803   SmallPtrSet<SUnit *, 8> Visited;
2804   SmallVector<SDep, 8> Worklist;
2805   Worklist.push_back(Dep);
2806   int EarlyCycle = INT_MAX;
2807   while (!Worklist.empty()) {
2808     const SDep &Cur = Worklist.pop_back_val();
2809     SUnit *PrevSU = Cur.getSUnit();
2810     if (Visited.count(PrevSU))
2811       continue;
2812     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2813     if (it == InstrToCycle.end())
2814       continue;
2815     EarlyCycle = std::min(EarlyCycle, it->second);
2816     for (const auto &PI : PrevSU->Preds)
2817       if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2818         Worklist.push_back(PI);
2819     Visited.insert(PrevSU);
2820   }
2821   return EarlyCycle;
2822 }
2823 
2824 // Return the cycle of the latest scheduled instruction in the chain.
2825 int SMSchedule::latestCycleInChain(const SDep &Dep) {
2826   SmallPtrSet<SUnit *, 8> Visited;
2827   SmallVector<SDep, 8> Worklist;
2828   Worklist.push_back(Dep);
2829   int LateCycle = INT_MIN;
2830   while (!Worklist.empty()) {
2831     const SDep &Cur = Worklist.pop_back_val();
2832     SUnit *SuccSU = Cur.getSUnit();
2833     if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2834       continue;
2835     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2836     if (it == InstrToCycle.end())
2837       continue;
2838     LateCycle = std::max(LateCycle, it->second);
2839     for (const auto &SI : SuccSU->Succs)
2840       if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2841         Worklist.push_back(SI);
2842     Visited.insert(SuccSU);
2843   }
2844   return LateCycle;
2845 }
2846 
2847 /// If an instruction has a use that spans multiple iterations, then
2848 /// return true. These instructions are characterized by having a back-ege
2849 /// to a Phi, which contains a reference to another Phi.
2850 static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
2851   for (auto &P : SU->Preds)
2852     if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2853       for (auto &S : P.getSUnit()->Succs)
2854         if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2855           return P.getSUnit();
2856   return nullptr;
2857 }
2858 
2859 /// Compute the scheduling start slot for the instruction.  The start slot
2860 /// depends on any predecessor or successor nodes scheduled already.
2861 void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2862                               int *MinEnd, int *MaxStart, int II,
2863                               SwingSchedulerDAG *DAG) {
2864   // Iterate over each instruction that has been scheduled already.  The start
2865   // slot computation depends on whether the previously scheduled instruction
2866   // is a predecessor or successor of the specified instruction.
2867   for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2868 
2869     // Iterate over each instruction in the current cycle.
2870     for (SUnit *I : getInstructions(cycle)) {
2871       // Because we're processing a DAG for the dependences, we recognize
2872       // the back-edge in recurrences by anti dependences.
2873       for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2874         const SDep &Dep = SU->Preds[i];
2875         if (Dep.getSUnit() == I) {
2876           if (!DAG->isBackedge(SU, Dep)) {
2877             int EarlyStart = cycle + Dep.getLatency() -
2878                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2879             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2880             if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2881               int End = earliestCycleInChain(Dep) + (II - 1);
2882               *MinEnd = std::min(*MinEnd, End);
2883             }
2884           } else {
2885             int LateStart = cycle - Dep.getLatency() +
2886                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2887             *MinLateStart = std::min(*MinLateStart, LateStart);
2888           }
2889         }
2890         // For instruction that requires multiple iterations, make sure that
2891         // the dependent instruction is not scheduled past the definition.
2892         SUnit *BE = multipleIterations(I, DAG);
2893         if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2894             !SU->isPred(I))
2895           *MinLateStart = std::min(*MinLateStart, cycle);
2896       }
2897       for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2898         if (SU->Succs[i].getSUnit() == I) {
2899           const SDep &Dep = SU->Succs[i];
2900           if (!DAG->isBackedge(SU, Dep)) {
2901             int LateStart = cycle - Dep.getLatency() +
2902                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2903             *MinLateStart = std::min(*MinLateStart, LateStart);
2904             if (DAG->isLoopCarriedDep(SU, Dep)) {
2905               int Start = latestCycleInChain(Dep) + 1 - II;
2906               *MaxStart = std::max(*MaxStart, Start);
2907             }
2908           } else {
2909             int EarlyStart = cycle + Dep.getLatency() -
2910                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2911             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2912           }
2913         }
2914       }
2915     }
2916   }
2917 }
2918 
2919 /// Order the instructions within a cycle so that the definitions occur
2920 /// before the uses. Returns true if the instruction is added to the start
2921 /// of the list, or false if added to the end.
2922 void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
2923                                  std::deque<SUnit *> &Insts) const {
2924   MachineInstr *MI = SU->getInstr();
2925   bool OrderBeforeUse = false;
2926   bool OrderAfterDef = false;
2927   bool OrderBeforeDef = false;
2928   unsigned MoveDef = 0;
2929   unsigned MoveUse = 0;
2930   int StageInst1 = stageScheduled(SU);
2931 
2932   unsigned Pos = 0;
2933   for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2934        ++I, ++Pos) {
2935     for (MachineOperand &MO : MI->operands()) {
2936       if (!MO.isReg() || !MO.getReg().isVirtual())
2937         continue;
2938 
2939       Register Reg = MO.getReg();
2940       unsigned BasePos, OffsetPos;
2941       if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2942         if (MI->getOperand(BasePos).getReg() == Reg)
2943           if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2944             Reg = NewReg;
2945       bool Reads, Writes;
2946       std::tie(Reads, Writes) =
2947           (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2948       if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2949         OrderBeforeUse = true;
2950         if (MoveUse == 0)
2951           MoveUse = Pos;
2952       } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2953         // Add the instruction after the scheduled instruction.
2954         OrderAfterDef = true;
2955         MoveDef = Pos;
2956       } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2957         if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2958           OrderBeforeUse = true;
2959           if (MoveUse == 0)
2960             MoveUse = Pos;
2961         } else {
2962           OrderAfterDef = true;
2963           MoveDef = Pos;
2964         }
2965       } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2966         OrderBeforeUse = true;
2967         if (MoveUse == 0)
2968           MoveUse = Pos;
2969         if (MoveUse != 0) {
2970           OrderAfterDef = true;
2971           MoveDef = Pos - 1;
2972         }
2973       } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2974         // Add the instruction before the scheduled instruction.
2975         OrderBeforeUse = true;
2976         if (MoveUse == 0)
2977           MoveUse = Pos;
2978       } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2979                  isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2980         if (MoveUse == 0) {
2981           OrderBeforeDef = true;
2982           MoveUse = Pos;
2983         }
2984       }
2985     }
2986     // Check for order dependences between instructions. Make sure the source
2987     // is ordered before the destination.
2988     for (auto &S : SU->Succs) {
2989       if (S.getSUnit() != *I)
2990         continue;
2991       if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2992         OrderBeforeUse = true;
2993         if (Pos < MoveUse)
2994           MoveUse = Pos;
2995       }
2996       // We did not handle HW dependences in previous for loop,
2997       // and we normally set Latency = 0 for Anti deps,
2998       // so may have nodes in same cycle with Anti denpendent on HW regs.
2999       else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
3000         OrderBeforeUse = true;
3001         if ((MoveUse == 0) || (Pos < MoveUse))
3002           MoveUse = Pos;
3003       }
3004     }
3005     for (auto &P : SU->Preds) {
3006       if (P.getSUnit() != *I)
3007         continue;
3008       if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3009         OrderAfterDef = true;
3010         MoveDef = Pos;
3011       }
3012     }
3013   }
3014 
3015   // A circular dependence.
3016   if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3017     OrderBeforeUse = false;
3018 
3019   // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3020   // to a loop-carried dependence.
3021   if (OrderBeforeDef)
3022     OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3023 
3024   // The uncommon case when the instruction order needs to be updated because
3025   // there is both a use and def.
3026   if (OrderBeforeUse && OrderAfterDef) {
3027     SUnit *UseSU = Insts.at(MoveUse);
3028     SUnit *DefSU = Insts.at(MoveDef);
3029     if (MoveUse > MoveDef) {
3030       Insts.erase(Insts.begin() + MoveUse);
3031       Insts.erase(Insts.begin() + MoveDef);
3032     } else {
3033       Insts.erase(Insts.begin() + MoveDef);
3034       Insts.erase(Insts.begin() + MoveUse);
3035     }
3036     orderDependence(SSD, UseSU, Insts);
3037     orderDependence(SSD, SU, Insts);
3038     orderDependence(SSD, DefSU, Insts);
3039     return;
3040   }
3041   // Put the new instruction first if there is a use in the list. Otherwise,
3042   // put it at the end of the list.
3043   if (OrderBeforeUse)
3044     Insts.push_front(SU);
3045   else
3046     Insts.push_back(SU);
3047 }
3048 
3049 /// Return true if the scheduled Phi has a loop carried operand.
3050 bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
3051                                MachineInstr &Phi) const {
3052   if (!Phi.isPHI())
3053     return false;
3054   assert(Phi.isPHI() && "Expecting a Phi.");
3055   SUnit *DefSU = SSD->getSUnit(&Phi);
3056   unsigned DefCycle = cycleScheduled(DefSU);
3057   int DefStage = stageScheduled(DefSU);
3058 
3059   unsigned InitVal = 0;
3060   unsigned LoopVal = 0;
3061   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3062   SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3063   if (!UseSU)
3064     return true;
3065   if (UseSU->getInstr()->isPHI())
3066     return true;
3067   unsigned LoopCycle = cycleScheduled(UseSU);
3068   int LoopStage = stageScheduled(UseSU);
3069   return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3070 }
3071 
3072 /// Return true if the instruction is a definition that is loop carried
3073 /// and defines the use on the next iteration.
3074 ///        v1 = phi(v2, v3)
3075 ///  (Def) v3 = op v1
3076 ///  (MO)   = v1
3077 /// If MO appears before Def, then v1 and v3 may get assigned to the same
3078 /// register.
3079 bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
3080                                        MachineInstr *Def,
3081                                        MachineOperand &MO) const {
3082   if (!MO.isReg())
3083     return false;
3084   if (Def->isPHI())
3085     return false;
3086   MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3087   if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3088     return false;
3089   if (!isLoopCarried(SSD, *Phi))
3090     return false;
3091   unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3092   for (MachineOperand &DMO : Def->all_defs()) {
3093     if (DMO.getReg() == LoopReg)
3094       return true;
3095   }
3096   return false;
3097 }
3098 
3099 /// Determine transitive dependences of unpipelineable instructions
3100 SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
3101     SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
3102   SmallSet<SUnit *, 8> DoNotPipeline;
3103   SmallVector<SUnit *, 8> Worklist;
3104 
3105   for (auto &SU : SSD->SUnits)
3106     if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3107       Worklist.push_back(&SU);
3108 
3109   while (!Worklist.empty()) {
3110     auto SU = Worklist.pop_back_val();
3111     if (DoNotPipeline.count(SU))
3112       continue;
3113     LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3114     DoNotPipeline.insert(SU);
3115     for (auto &Dep : SU->Preds)
3116       Worklist.push_back(Dep.getSUnit());
3117     if (SU->getInstr()->isPHI())
3118       for (auto &Dep : SU->Succs)
3119         if (Dep.getKind() == SDep::Anti)
3120           Worklist.push_back(Dep.getSUnit());
3121   }
3122   return DoNotPipeline;
3123 }
3124 
3125 // Determine all instructions upon which any unpipelineable instruction depends
3126 // and ensure that they are in stage 0.  If unable to do so, return false.
3127 bool SMSchedule::normalizeNonPipelinedInstructions(
3128     SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
3129   SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
3130 
3131   int NewLastCycle = INT_MIN;
3132   for (SUnit &SU : SSD->SUnits) {
3133     if (!SU.isInstr())
3134       continue;
3135     if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3136       NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3137       continue;
3138     }
3139 
3140     // Put the non-pipelined instruction as early as possible in the schedule
3141     int NewCycle = getFirstCycle();
3142     for (auto &Dep : SU.Preds)
3143       NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
3144 
3145     int OldCycle = InstrToCycle[&SU];
3146     if (OldCycle != NewCycle) {
3147       InstrToCycle[&SU] = NewCycle;
3148       auto &OldS = getInstructions(OldCycle);
3149       llvm::erase(OldS, &SU);
3150       getInstructions(NewCycle).emplace_back(&SU);
3151       LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3152                         << ") is not pipelined; moving from cycle " << OldCycle
3153                         << " to " << NewCycle << " Instr:" << *SU.getInstr());
3154     }
3155     NewLastCycle = std::max(NewLastCycle, NewCycle);
3156   }
3157   LastCycle = NewLastCycle;
3158   return true;
3159 }
3160 
3161 // Check if the generated schedule is valid. This function checks if
3162 // an instruction that uses a physical register is scheduled in a
3163 // different stage than the definition. The pipeliner does not handle
3164 // physical register values that may cross a basic block boundary.
3165 // Furthermore, if a physical def/use pair is assigned to the same
3166 // cycle, orderDependence does not guarantee def/use ordering, so that
3167 // case should be considered invalid.  (The test checks for both
3168 // earlier and same-cycle use to be more robust.)
3169 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3170   for (SUnit &SU : SSD->SUnits) {
3171     if (!SU.hasPhysRegDefs)
3172       continue;
3173     int StageDef = stageScheduled(&SU);
3174     int CycleDef = InstrToCycle[&SU];
3175     assert(StageDef != -1 && "Instruction should have been scheduled.");
3176     for (auto &SI : SU.Succs)
3177       if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
3178         if (Register::isPhysicalRegister(SI.getReg())) {
3179           if (stageScheduled(SI.getSUnit()) != StageDef)
3180             return false;
3181           if (InstrToCycle[SI.getSUnit()] <= CycleDef)
3182             return false;
3183         }
3184   }
3185   return true;
3186 }
3187 
3188 /// A property of the node order in swing-modulo-scheduling is
3189 /// that for nodes outside circuits the following holds:
3190 /// none of them is scheduled after both a successor and a
3191 /// predecessor.
3192 /// The method below checks whether the property is met.
3193 /// If not, debug information is printed and statistics information updated.
3194 /// Note that we do not use an assert statement.
3195 /// The reason is that although an invalid node oder may prevent
3196 /// the pipeliner from finding a pipelined schedule for arbitrary II,
3197 /// it does not lead to the generation of incorrect code.
3198 void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3199 
3200   // a sorted vector that maps each SUnit to its index in the NodeOrder
3201   typedef std::pair<SUnit *, unsigned> UnitIndex;
3202   std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3203 
3204   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3205     Indices.push_back(std::make_pair(NodeOrder[i], i));
3206 
3207   auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3208     return std::get<0>(i1) < std::get<0>(i2);
3209   };
3210 
3211   // sort, so that we can perform a binary search
3212   llvm::sort(Indices, CompareKey);
3213 
3214   bool Valid = true;
3215   (void)Valid;
3216   // for each SUnit in the NodeOrder, check whether
3217   // it appears after both a successor and a predecessor
3218   // of the SUnit. If this is the case, and the SUnit
3219   // is not part of circuit, then the NodeOrder is not
3220   // valid.
3221   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3222     SUnit *SU = NodeOrder[i];
3223     unsigned Index = i;
3224 
3225     bool PredBefore = false;
3226     bool SuccBefore = false;
3227 
3228     SUnit *Succ;
3229     SUnit *Pred;
3230     (void)Succ;
3231     (void)Pred;
3232 
3233     for (SDep &PredEdge : SU->Preds) {
3234       SUnit *PredSU = PredEdge.getSUnit();
3235       unsigned PredIndex = std::get<1>(
3236           *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3237       if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3238         PredBefore = true;
3239         Pred = PredSU;
3240         break;
3241       }
3242     }
3243 
3244     for (SDep &SuccEdge : SU->Succs) {
3245       SUnit *SuccSU = SuccEdge.getSUnit();
3246       // Do not process a boundary node, it was not included in NodeOrder,
3247       // hence not in Indices either, call to std::lower_bound() below will
3248       // return Indices.end().
3249       if (SuccSU->isBoundaryNode())
3250         continue;
3251       unsigned SuccIndex = std::get<1>(
3252           *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3253       if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3254         SuccBefore = true;
3255         Succ = SuccSU;
3256         break;
3257       }
3258     }
3259 
3260     if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3261       // instructions in circuits are allowed to be scheduled
3262       // after both a successor and predecessor.
3263       bool InCircuit = llvm::any_of(
3264           Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3265       if (InCircuit)
3266         LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3267       else {
3268         Valid = false;
3269         NumNodeOrderIssues++;
3270         LLVM_DEBUG(dbgs() << "Predecessor ";);
3271       }
3272       LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3273                         << " are scheduled before node " << SU->NodeNum
3274                         << "\n";);
3275     }
3276   }
3277 
3278   LLVM_DEBUG({
3279     if (!Valid)
3280       dbgs() << "Invalid node order found!\n";
3281   });
3282 }
3283 
3284 /// Attempt to fix the degenerate cases when the instruction serialization
3285 /// causes the register lifetimes to overlap. For example,
3286 ///   p' = store_pi(p, b)
3287 ///      = load p, offset
3288 /// In this case p and p' overlap, which means that two registers are needed.
3289 /// Instead, this function changes the load to use p' and updates the offset.
3290 void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3291   unsigned OverlapReg = 0;
3292   unsigned NewBaseReg = 0;
3293   for (SUnit *SU : Instrs) {
3294     MachineInstr *MI = SU->getInstr();
3295     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3296       const MachineOperand &MO = MI->getOperand(i);
3297       // Look for an instruction that uses p. The instruction occurs in the
3298       // same cycle but occurs later in the serialized order.
3299       if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3300         // Check that the instruction appears in the InstrChanges structure,
3301         // which contains instructions that can have the offset updated.
3302         DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3303           InstrChanges.find(SU);
3304         if (It != InstrChanges.end()) {
3305           unsigned BasePos, OffsetPos;
3306           // Update the base register and adjust the offset.
3307           if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3308             MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3309             NewMI->getOperand(BasePos).setReg(NewBaseReg);
3310             int64_t NewOffset =
3311                 MI->getOperand(OffsetPos).getImm() - It->second.second;
3312             NewMI->getOperand(OffsetPos).setImm(NewOffset);
3313             SU->setInstr(NewMI);
3314             MISUnitMap[NewMI] = SU;
3315             NewMIs[MI] = NewMI;
3316           }
3317         }
3318         OverlapReg = 0;
3319         NewBaseReg = 0;
3320         break;
3321       }
3322       // Look for an instruction of the form p' = op(p), which uses and defines
3323       // two virtual registers that get allocated to the same physical register.
3324       unsigned TiedUseIdx = 0;
3325       if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3326         // OverlapReg is p in the example above.
3327         OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3328         // NewBaseReg is p' in the example above.
3329         NewBaseReg = MI->getOperand(i).getReg();
3330         break;
3331       }
3332     }
3333   }
3334 }
3335 
3336 std::deque<SUnit *>
3337 SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
3338                                 const std::deque<SUnit *> &Instrs) const {
3339   std::deque<SUnit *> NewOrderPhi;
3340   for (SUnit *SU : Instrs) {
3341     if (SU->getInstr()->isPHI())
3342       NewOrderPhi.push_back(SU);
3343   }
3344   std::deque<SUnit *> NewOrderI;
3345   for (SUnit *SU : Instrs) {
3346     if (!SU->getInstr()->isPHI())
3347       orderDependence(SSD, SU, NewOrderI);
3348   }
3349   llvm::append_range(NewOrderPhi, NewOrderI);
3350   return NewOrderPhi;
3351 }
3352 
3353 /// After the schedule has been formed, call this function to combine
3354 /// the instructions from the different stages/cycles.  That is, this
3355 /// function creates a schedule that represents a single iteration.
3356 void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3357   // Move all instructions to the first stage from later stages.
3358   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3359     for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3360          ++stage) {
3361       std::deque<SUnit *> &cycleInstrs =
3362           ScheduledInstrs[cycle + (stage * InitiationInterval)];
3363       for (SUnit *SU : llvm::reverse(cycleInstrs))
3364         ScheduledInstrs[cycle].push_front(SU);
3365     }
3366   }
3367 
3368   // Erase all the elements in the later stages. Only one iteration should
3369   // remain in the scheduled list, and it contains all the instructions.
3370   for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3371     ScheduledInstrs.erase(cycle);
3372 
3373   // Change the registers in instruction as specified in the InstrChanges
3374   // map. We need to use the new registers to create the correct order.
3375   for (const SUnit &SU : SSD->SUnits)
3376     SSD->applyInstrChange(SU.getInstr(), *this);
3377 
3378   // Reorder the instructions in each cycle to fix and improve the
3379   // generated code.
3380   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3381     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3382     cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3383     SSD->fixupRegisterOverlaps(cycleInstrs);
3384   }
3385 
3386   LLVM_DEBUG(dump(););
3387 }
3388 
3389 void NodeSet::print(raw_ostream &os) const {
3390   os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3391      << " depth " << MaxDepth << " col " << Colocate << "\n";
3392   for (const auto &I : Nodes)
3393     os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
3394   os << "\n";
3395 }
3396 
3397 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3398 /// Print the schedule information to the given output.
3399 void SMSchedule::print(raw_ostream &os) const {
3400   // Iterate over each cycle.
3401   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3402     // Iterate over each instruction in the cycle.
3403     const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3404     for (SUnit *CI : cycleInstrs->second) {
3405       os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3406       os << "(" << CI->NodeNum << ") ";
3407       CI->getInstr()->print(os);
3408       os << "\n";
3409     }
3410   }
3411 }
3412 
3413 /// Utility function used for debugging to print the schedule.
3414 LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
3415 LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
3416 
3417 void ResourceManager::dumpMRT() const {
3418   LLVM_DEBUG({
3419     if (UseDFA)
3420       return;
3421     std::stringstream SS;
3422     SS << "MRT:\n";
3423     SS << std::setw(4) << "Slot";
3424     for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3425       SS << std::setw(3) << I;
3426     SS << std::setw(7) << "#Mops"
3427        << "\n";
3428     for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3429       SS << std::setw(4) << Slot;
3430       for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3431         SS << std::setw(3) << MRT[Slot][I];
3432       SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3433     }
3434     dbgs() << SS.str();
3435   });
3436 }
3437 #endif
3438 
3439 void ResourceManager::initProcResourceVectors(
3440     const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3441   unsigned ProcResourceID = 0;
3442 
3443   // We currently limit the resource kinds to 64 and below so that we can use
3444   // uint64_t for Masks
3445   assert(SM.getNumProcResourceKinds() < 64 &&
3446          "Too many kinds of resources, unsupported");
3447   // Create a unique bitmask for every processor resource unit.
3448   // Skip resource at index 0, since it always references 'InvalidUnit'.
3449   Masks.resize(SM.getNumProcResourceKinds());
3450   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3451     const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3452     if (Desc.SubUnitsIdxBegin)
3453       continue;
3454     Masks[I] = 1ULL << ProcResourceID;
3455     ProcResourceID++;
3456   }
3457   // Create a unique bitmask for every processor resource group.
3458   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3459     const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3460     if (!Desc.SubUnitsIdxBegin)
3461       continue;
3462     Masks[I] = 1ULL << ProcResourceID;
3463     for (unsigned U = 0; U < Desc.NumUnits; ++U)
3464       Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3465     ProcResourceID++;
3466   }
3467   LLVM_DEBUG({
3468     if (SwpShowResMask) {
3469       dbgs() << "ProcResourceDesc:\n";
3470       for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3471         const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3472         dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3473                          ProcResource->Name, I, Masks[I],
3474                          ProcResource->NumUnits);
3475       }
3476       dbgs() << " -----------------\n";
3477     }
3478   });
3479 }
3480 
3481 bool ResourceManager::canReserveResources(SUnit &SU, int Cycle) {
3482   LLVM_DEBUG({
3483     if (SwpDebugResource)
3484       dbgs() << "canReserveResources:\n";
3485   });
3486   if (UseDFA)
3487     return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3488         ->canReserveResources(&SU.getInstr()->getDesc());
3489 
3490   const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3491   if (!SCDesc->isValid()) {
3492     LLVM_DEBUG({
3493       dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3494       dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3495     });
3496     return true;
3497   }
3498 
3499   reserveResources(SCDesc, Cycle);
3500   bool Result = !isOverbooked();
3501   unreserveResources(SCDesc, Cycle);
3502 
3503   LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
3504   return Result;
3505 }
3506 
3507 void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3508   LLVM_DEBUG({
3509     if (SwpDebugResource)
3510       dbgs() << "reserveResources:\n";
3511   });
3512   if (UseDFA)
3513     return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3514         ->reserveResources(&SU.getInstr()->getDesc());
3515 
3516   const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3517   if (!SCDesc->isValid()) {
3518     LLVM_DEBUG({
3519       dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3520       dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3521     });
3522     return;
3523   }
3524 
3525   reserveResources(SCDesc, Cycle);
3526 
3527   LLVM_DEBUG({
3528     if (SwpDebugResource) {
3529       dumpMRT();
3530       dbgs() << "reserveResources: done!\n\n";
3531     }
3532   });
3533 }
3534 
3535 void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3536                                        int Cycle) {
3537   assert(!UseDFA);
3538   for (const MCWriteProcResEntry &PRE : make_range(
3539            STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3540     for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3541       ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3542 
3543   for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3544     ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3545 }
3546 
3547 void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3548                                          int Cycle) {
3549   assert(!UseDFA);
3550   for (const MCWriteProcResEntry &PRE : make_range(
3551            STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3552     for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3553       --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3554 
3555   for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3556     --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3557 }
3558 
3559 bool ResourceManager::isOverbooked() const {
3560   assert(!UseDFA);
3561   for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3562     for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3563       const MCProcResourceDesc *Desc = SM.getProcResource(I);
3564       if (MRT[Slot][I] > Desc->NumUnits)
3565         return true;
3566     }
3567     if (NumScheduledMops[Slot] > IssueWidth)
3568       return true;
3569   }
3570   return false;
3571 }
3572 
3573 int ResourceManager::calculateResMIIDFA() const {
3574   assert(UseDFA);
3575 
3576   // Sort the instructions by the number of available choices for scheduling,
3577   // least to most. Use the number of critical resources as the tie breaker.
3578   FuncUnitSorter FUS = FuncUnitSorter(*ST);
3579   for (SUnit &SU : DAG->SUnits)
3580     FUS.calcCriticalResources(*SU.getInstr());
3581   PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
3582       FuncUnitOrder(FUS);
3583 
3584   for (SUnit &SU : DAG->SUnits)
3585     FuncUnitOrder.push(SU.getInstr());
3586 
3587   SmallVector<std::unique_ptr<DFAPacketizer>, 8> Resources;
3588   Resources.push_back(
3589       std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3590 
3591   while (!FuncUnitOrder.empty()) {
3592     MachineInstr *MI = FuncUnitOrder.top();
3593     FuncUnitOrder.pop();
3594     if (TII->isZeroCost(MI->getOpcode()))
3595       continue;
3596 
3597     // Attempt to reserve the instruction in an existing DFA. At least one
3598     // DFA is needed for each cycle.
3599     unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3600     unsigned ReservedCycles = 0;
3601     auto *RI = Resources.begin();
3602     auto *RE = Resources.end();
3603     LLVM_DEBUG({
3604       dbgs() << "Trying to reserve resource for " << NumCycles
3605              << " cycles for \n";
3606       MI->dump();
3607     });
3608     for (unsigned C = 0; C < NumCycles; ++C)
3609       while (RI != RE) {
3610         if ((*RI)->canReserveResources(*MI)) {
3611           (*RI)->reserveResources(*MI);
3612           ++ReservedCycles;
3613           break;
3614         }
3615         RI++;
3616       }
3617     LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3618                       << ", NumCycles:" << NumCycles << "\n");
3619     // Add new DFAs, if needed, to reserve resources.
3620     for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3621       LLVM_DEBUG(if (SwpDebugResource) dbgs()
3622                  << "NewResource created to reserve resources"
3623                  << "\n");
3624       auto *NewResource = TII->CreateTargetScheduleState(*ST);
3625       assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3626       NewResource->reserveResources(*MI);
3627       Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3628     }
3629   }
3630 
3631   int Resmii = Resources.size();
3632   LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3633   return Resmii;
3634 }
3635 
3636 int ResourceManager::calculateResMII() const {
3637   if (UseDFA)
3638     return calculateResMIIDFA();
3639 
3640   // Count each resource consumption and divide it by the number of units.
3641   // ResMII is the max value among them.
3642 
3643   int NumMops = 0;
3644   SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
3645   for (SUnit &SU : DAG->SUnits) {
3646     if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3647       continue;
3648 
3649     const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3650     if (!SCDesc->isValid())
3651       continue;
3652 
3653     LLVM_DEBUG({
3654       if (SwpDebugResource) {
3655         DAG->dumpNode(SU);
3656         dbgs() << "  #Mops: " << SCDesc->NumMicroOps << "\n"
3657                << "  WriteProcRes: ";
3658       }
3659     });
3660     NumMops += SCDesc->NumMicroOps;
3661     for (const MCWriteProcResEntry &PRE :
3662          make_range(STI->getWriteProcResBegin(SCDesc),
3663                     STI->getWriteProcResEnd(SCDesc))) {
3664       LLVM_DEBUG({
3665         if (SwpDebugResource) {
3666           const MCProcResourceDesc *Desc =
3667               SM.getProcResource(PRE.ProcResourceIdx);
3668           dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3669         }
3670       });
3671       ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3672     }
3673     LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3674   }
3675 
3676   int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3677   LLVM_DEBUG({
3678     if (SwpDebugResource)
3679       dbgs() << "#Mops: " << NumMops << ", "
3680              << "IssueWidth: " << IssueWidth << ", "
3681              << "Cycles: " << Result << "\n";
3682   });
3683 
3684   LLVM_DEBUG({
3685     if (SwpDebugResource) {
3686       std::stringstream SS;
3687       SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3688          << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3689          << "\n";
3690       dbgs() << SS.str();
3691     }
3692   });
3693   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3694     const MCProcResourceDesc *Desc = SM.getProcResource(I);
3695     int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3696     LLVM_DEBUG({
3697       if (SwpDebugResource) {
3698         std::stringstream SS;
3699         SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3700            << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3701            << std::setw(10) << Cycles << "\n";
3702         dbgs() << SS.str();
3703       }
3704     });
3705     if (Cycles > Result)
3706       Result = Cycles;
3707   }
3708   return Result;
3709 }
3710 
3711 void ResourceManager::init(int II) {
3712   InitiationInterval = II;
3713   DFAResources.clear();
3714   DFAResources.resize(II);
3715   for (auto &I : DFAResources)
3716     I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3717   MRT.clear();
3718   MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
3719   NumScheduledMops.clear();
3720   NumScheduledMops.resize(II);
3721 }
3722