xref: /freebsd-src/contrib/llvm-project/llvm/lib/CodeGen/WindowScheduler.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 //======----------- WindowScheduler.cpp - window scheduler -------------======//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // An implementation of the Window Scheduling software pipelining algorithm.
10 //
11 // The fundamental concept of the window scheduling algorithm involves folding
12 // the original MBB at a specific position, followed by list scheduling on the
13 // folded MIs. The optimal scheduling result is then chosen from various folding
14 // positions as the final scheduling outcome.
15 //
16 // The primary challenge in this algorithm lies in generating the folded MIs and
17 // establishing their dependencies. We have innovatively employed a new MBB,
18 // created by copying the original MBB three times, known as TripleMBB. This
19 // TripleMBB enables the convenient implementation of MI folding and dependency
20 // establishment. To facilitate the algorithm's implementation, we have also
21 // devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
22 //
23 // Another challenge in the algorithm is the scheduling of phis. Semantically,
24 // it is difficult to place the phis in the window and perform list scheduling.
25 // Therefore, we schedule these phis separately after each list scheduling.
26 //
27 // The provided implementation is designed for use before the Register Allocator
28 // (RA). If the target requires implementation after RA, it is recommended to
29 // reimplement analyseII(), schedulePhi(), and expand(). Additionally,
30 // target-specific logic can be added in initialize(), preProcess(), and
31 // postProcess().
32 //
33 // Lastly, it is worth mentioning that getSearchIndexes() is an important
34 // function. We have experimented with more complex heuristics on downstream
35 // target and achieved favorable results.
36 //
37 //===----------------------------------------------------------------------===//
38 #include "llvm/CodeGen/WindowScheduler.h"
39 #include "llvm/ADT/Statistic.h"
40 #include "llvm/CodeGen/LiveIntervals.h"
41 #include "llvm/CodeGen/MachineLoopInfo.h"
42 #include "llvm/CodeGen/MachinePipeliner.h"
43 #include "llvm/CodeGen/ModuloSchedule.h"
44 #include "llvm/CodeGen/TargetPassConfig.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Support/TimeProfiler.h"
48 
49 using namespace llvm;
50 
51 #define DEBUG_TYPE "pipeliner"
52 
53 namespace {
54 STATISTIC(NumTryWindowSchedule,
55           "Number of loops that we attempt to use window scheduling");
56 STATISTIC(NumTryWindowSearch,
57           "Number of times that we run list schedule in the window scheduling");
58 STATISTIC(NumWindowSchedule,
59           "Number of loops that we successfully use window scheduling");
60 STATISTIC(NumFailAnalyseII,
61           "Window scheduling abort due to the failure of the II analysis");
62 
63 cl::opt<unsigned>
64     WindowSearchNum("window-search-num",
65                     cl::desc("The number of searches per loop in the window "
66                              "algorithm. 0 means no search number limit."),
67                     cl::Hidden, cl::init(6));
68 
69 cl::opt<unsigned> WindowSearchRatio(
70     "window-search-ratio",
71     cl::desc("The ratio of searches per loop in the window algorithm. 100 "
72              "means search all positions in the loop, while 0 means not "
73              "performing any search."),
74     cl::Hidden, cl::init(40));
75 
76 cl::opt<unsigned> WindowIICoeff(
77     "window-ii-coeff",
78     cl::desc(
79         "The coefficient used when initializing II in the window algorithm."),
80     cl::Hidden, cl::init(5));
81 
82 cl::opt<unsigned> WindowRegionLimit(
83     "window-region-limit",
84     cl::desc(
85         "The lower limit of the scheduling region in the window algorithm."),
86     cl::Hidden, cl::init(3));
87 
88 cl::opt<unsigned> WindowDiffLimit(
89     "window-diff-limit",
90     cl::desc("The lower limit of the difference between best II and base II in "
91              "the window algorithm. If the difference is smaller than "
92              "this lower limit, window scheduling will not be performed."),
93     cl::Hidden, cl::init(2));
94 } // namespace
95 
96 // WindowIILimit serves as an indicator of abnormal scheduling results and could
97 // potentially be referenced by the derived target window scheduler.
98 cl::opt<unsigned>
99     WindowIILimit("window-ii-limit",
100                   cl::desc("The upper limit of II in the window algorithm."),
101                   cl::Hidden, cl::init(1000));
102 
103 WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
104     : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML),
105       Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()),
106       TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) {
107   TripleDAG = std::unique_ptr<ScheduleDAGInstrs>(
108       createMachineScheduler(/*OnlyBuildGraph=*/true));
109 }
110 
111 bool WindowScheduler::run() {
112   if (!initialize()) {
113     LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
114     return false;
115   }
116   // The window algorithm is time-consuming, and its compilation time should be
117   // taken into consideration.
118   TimeTraceScope Scope("WindowSearch");
119   ++NumTryWindowSchedule;
120   // Performing the relevant processing before window scheduling.
121   preProcess();
122   // The main window scheduling begins.
123   std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
124   auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio);
125   for (unsigned Idx : SearchIndexes) {
126     OriToCycle.clear();
127     ++NumTryWindowSearch;
128     // The scheduling starts with non-phi instruction, so SchedPhiNum needs to
129     // be added to Idx.
130     unsigned Offset = Idx + SchedPhiNum;
131     auto Range = getScheduleRange(Offset, SchedInstrNum);
132     SchedDAG->startBlock(MBB);
133     SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum);
134     SchedDAG->schedule();
135     LLVM_DEBUG(SchedDAG->dump());
136     unsigned II = analyseII(*SchedDAG, Offset);
137     if (II == WindowIILimit) {
138       restoreTripleMBB();
139       LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
140       ++NumFailAnalyseII;
141       continue;
142     }
143     schedulePhi(Offset, II);
144     updateScheduleResult(Offset, II);
145     restoreTripleMBB();
146     LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
147                       << II << ".\n");
148   }
149   // Performing the relevant processing after window scheduling.
150   postProcess();
151   // Check whether the scheduling result is valid.
152   if (!isScheduleValid()) {
153     LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
154     return false;
155   }
156   LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
157                     << " and Best II is " << BestII << ".\n");
158   // Expand the scheduling result to prologue, kernel, and epilogue.
159   expand();
160   ++NumWindowSchedule;
161   return true;
162 }
163 
164 ScheduleDAGInstrs *
165 WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
166   return OnlyBuildGraph
167              ? new ScheduleDAGMI(
168                    Context, std::make_unique<PostGenericScheduler>(Context),
169                    true)
170              : Context->PassConfig->createMachineScheduler(Context);
171 }
172 
173 bool WindowScheduler::initialize() {
174   if (!Subtarget->enableWindowScheduler()) {
175     LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
176     return false;
177   }
178   // Initialized the member variables used by window algorithm.
179   OriMIs.clear();
180   TriMIs.clear();
181   TriToOri.clear();
182   OriToCycle.clear();
183   SchedResult.clear();
184   SchedPhiNum = 0;
185   SchedInstrNum = 0;
186   BestII = UINT_MAX;
187   BestOffset = 0;
188   BaseII = 0;
189   // List scheduling used in the window algorithm depends on LiveIntervals.
190   if (!Context->LIS) {
191     LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
192     return false;
193   }
194   // Check each MI in MBB.
195   SmallSet<Register, 8> PrevDefs;
196   SmallSet<Register, 8> PrevUses;
197   auto IsLoopCarried = [&](MachineInstr &Phi) {
198     // Two cases are checked here: (1)The virtual register defined by the
199     // preceding phi is used by the succeeding phi;(2)The preceding phi uses the
200     // virtual register defined by the succeeding phi.
201     if (PrevUses.count(Phi.getOperand(0).getReg()))
202       return true;
203     PrevDefs.insert(Phi.getOperand(0).getReg());
204     for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
205       if (PrevDefs.count(Phi.getOperand(I).getReg()))
206         return true;
207       PrevUses.insert(Phi.getOperand(I).getReg());
208     }
209     return false;
210   };
211   auto PLI = TII->analyzeLoopForPipelining(MBB);
212   for (auto &MI : *MBB) {
213     if (MI.isMetaInstruction() || MI.isTerminator())
214       continue;
215     if (MI.isPHI()) {
216       if (IsLoopCarried(MI)) {
217         LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n");
218         return false;
219       }
220       ++SchedPhiNum;
221       ++BestOffset;
222     } else
223       ++SchedInstrNum;
224     if (TII->isSchedulingBoundary(MI, MBB, *MF)) {
225       LLVM_DEBUG(
226           dbgs() << "Boundary MI is not allowed in window scheduling!\n");
227       return false;
228     }
229     if (PLI->shouldIgnoreForPipelining(&MI)) {
230       LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
231                            "window scheduling!\n");
232       return false;
233     }
234     for (auto &Def : MI.all_defs())
235       if (Def.isReg() && Def.getReg().isPhysical())
236         return false;
237   }
238   if (SchedInstrNum <= WindowRegionLimit) {
239     LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
240     return false;
241   }
242   return true;
243 }
244 
245 void WindowScheduler::preProcess() {
246   // Prior to window scheduling, it's necessary to backup the original MBB,
247   // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB.
248   backupMBB();
249   generateTripleMBB();
250   TripleDAG->startBlock(MBB);
251   TripleDAG->enterRegion(
252       MBB, MBB->begin(), MBB->getFirstTerminator(),
253       std::distance(MBB->begin(), MBB->getFirstTerminator()));
254   TripleDAG->buildSchedGraph(Context->AA);
255 }
256 
257 void WindowScheduler::postProcess() {
258   // After window scheduling, it's necessary to clear the TripleDAG and restore
259   // to the original MBB.
260   TripleDAG->exitRegion();
261   TripleDAG->finishBlock();
262   restoreMBB();
263 }
264 
265 void WindowScheduler::backupMBB() {
266   for (auto &MI : MBB->instrs())
267     OriMIs.push_back(&MI);
268   // Remove MIs and the corresponding live intervals.
269   for (auto &MI : make_early_inc_range(*MBB)) {
270     Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
271     MBB->remove(&MI);
272   }
273 }
274 
275 void WindowScheduler::restoreMBB() {
276   // Erase MIs and the corresponding live intervals.
277   for (auto &MI : make_early_inc_range(*MBB)) {
278     Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
279     MI.eraseFromParent();
280   }
281   // Restore MBB to the state before window scheduling.
282   for (auto *MI : OriMIs)
283     MBB->push_back(MI);
284   updateLiveIntervals();
285 }
286 
287 void WindowScheduler::generateTripleMBB() {
288   const unsigned DuplicateNum = 3;
289   TriMIs.clear();
290   TriToOri.clear();
291   assert(OriMIs.size() > 0 && "The Original MIs were not backed up!");
292   // Step 1: Performing the first copy of MBB instructions, excluding
293   // terminators. At the same time, we back up the anti-register of phis.
294   // DefPairs hold the old and new define register pairs.
295   DenseMap<Register, Register> DefPairs;
296   for (auto *MI : OriMIs) {
297     if (MI->isMetaInstruction() || MI->isTerminator())
298       continue;
299     if (MI->isPHI())
300       if (Register AntiReg = getAntiRegister(MI))
301         DefPairs[MI->getOperand(0).getReg()] = AntiReg;
302     auto *NewMI = MF->CloneMachineInstr(MI);
303     MBB->push_back(NewMI);
304     TriMIs.push_back(NewMI);
305     TriToOri[NewMI] = MI;
306   }
307   // Step 2: Performing the remaining two copies of MBB instructions excluding
308   // phis, and the last one contains terminators. At the same time, registers
309   // are updated accordingly.
310   for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) {
311     for (auto *MI : OriMIs) {
312       if (MI->isPHI() || MI->isMetaInstruction() ||
313           (MI->isTerminator() && Cnt < DuplicateNum - 1))
314         continue;
315       auto *NewMI = MF->CloneMachineInstr(MI);
316       DenseMap<Register, Register> NewDefs;
317       // New defines are updated.
318       for (auto MO : NewMI->all_defs())
319         if (MO.isReg() && MO.getReg().isVirtual()) {
320           Register NewDef =
321               MRI->createVirtualRegister(MRI->getRegClass(MO.getReg()));
322           NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI);
323           NewDefs[MO.getReg()] = NewDef;
324         }
325       // New uses are updated.
326       for (auto DefRegPair : DefPairs)
327         if (NewMI->readsRegister(DefRegPair.first, TRI)) {
328           Register NewUse = DefRegPair.second;
329           // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
330           //
331           // BB.3:                                  DefPairs
332           // ==================================
333           // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]  (%1,%7)
334           // ...
335           // ==================================
336           // ...
337           // %4 = sub i32 %1, %3
338           // ...
339           // %7 = add i32 %5, %6
340           // ...
341           // ----------------------------------
342           // ...
343           // %8 = sub i32 %7, %3                    (%1,%7),(%4,%8)
344           // ...
345           // %9 = add i32 %5, %6                    (%1,%7),(%4,%8),(%7,%9)
346           // ...
347           // ----------------------------------
348           // ...
349           // %10 = sub i32 %9, %3                   (%1,%7),(%4,%10),(%7,%9)
350           // ...            ^
351           // %11 = add i32 %5, %6                   (%1,%7),(%4,%10),(%7,%11)
352           // ...
353           // ==================================
354           //          < Terminators >
355           // ==================================
356           if (DefPairs.count(NewUse))
357             NewUse = DefPairs[NewUse];
358           NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI);
359         }
360       // DefPairs is updated at last.
361       for (auto &NewDef : NewDefs)
362         DefPairs[NewDef.first] = NewDef.second;
363       MBB->push_back(NewMI);
364       TriMIs.push_back(NewMI);
365       TriToOri[NewMI] = MI;
366     }
367   }
368   // Step 3: The registers used by phis are updated, and they are generated in
369   // the third copy of MBB.
370   // In the privious example, the old phi is:
371   // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
372   // The new phi is:
373   // %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
374   for (auto &Phi : MBB->phis()) {
375     for (auto DefRegPair : DefPairs)
376       if (Phi.readsRegister(DefRegPair.first, TRI))
377         Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI);
378   }
379   updateLiveIntervals();
380 }
381 
382 void WindowScheduler::restoreTripleMBB() {
383   // After list scheduling, the MBB is restored in one traversal.
384   for (size_t I = 0; I < TriMIs.size(); ++I) {
385     auto *MI = TriMIs[I];
386     auto OldPos = MBB->begin();
387     std::advance(OldPos, I);
388     auto CurPos = MI->getIterator();
389     if (CurPos != OldPos) {
390       MBB->splice(OldPos, MBB, CurPos);
391       Context->LIS->handleMove(*MI, /*UpdateFlags=*/false);
392     }
393   }
394 }
395 
396 SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
397                                                         unsigned SearchRatio) {
398   // We use SearchRatio to get the index range, and then evenly get the indexes
399   // according to the SearchNum. This is a simple huristic. Depending on the
400   // characteristics of the target, more complex algorithms can be used for both
401   // performance and compilation time.
402   assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!");
403   unsigned MaxIdx = SchedInstrNum * SearchRatio / 100;
404   unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1;
405   SmallVector<unsigned> SearchIndexes;
406   for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step)
407     SearchIndexes.push_back(Idx);
408   return SearchIndexes;
409 }
410 
411 int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
412   // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1.
413   unsigned MaxDepth = 1;
414   for (auto &SU : DAG.SUnits)
415     MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth);
416   return MaxDepth * WindowIICoeff;
417 }
418 
419 int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
420                                        unsigned Offset) {
421   int InitII = getEstimatedII(DAG);
422   ResourceManager RM(Subtarget, &DAG);
423   RM.init(InitII);
424   // ResourceManager and DAG are used to calculate the maximum cycle for the
425   // scheduled MIs. Since MIs in the Region have already been scheduled, the
426   // emit cycles can be estimated in order here.
427   int CurCycle = 0;
428   auto Range = getScheduleRange(Offset, SchedInstrNum);
429   for (auto &MI : Range) {
430     auto *SU = DAG.getSUnit(&MI);
431     int ExpectCycle = CurCycle;
432     // The predecessors of current MI determine its earliest issue cycle.
433     for (auto &Pred : SU->Preds) {
434       if (Pred.isWeak())
435         continue;
436       auto *PredMI = Pred.getSUnit()->getInstr();
437       int PredCycle = getOriCycle(PredMI);
438       ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
439     }
440     // ResourceManager can be used to detect resource conflicts between the
441     // current MI and the previously inserted MIs.
442     while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
443       ++CurCycle;
444       if (CurCycle == (int)WindowIILimit)
445         return CurCycle;
446     }
447     RM.reserveResources(*SU, CurCycle);
448     OriToCycle[getOriMI(&MI)] = CurCycle;
449     LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
450                       << getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
451   }
452   LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
453   return CurCycle;
454 }
455 
456 // By utilizing TripleDAG, we can easily establish dependencies between A and B.
457 // Based on the MaxCycle and the issue cycle of A and B, we can determine
458 // whether it is necessary to add a stall cycle. This is because, without
459 // inserting the stall cycle, the latency constraint between A and B cannot be
460 // satisfied. The details are as follows:
461 //
462 // New MBB:
463 // ========================================
464 //                 < Phis >
465 // ========================================     (sliding direction)
466 // MBB copy 1                                            |
467 //                                                       V
468 //
469 // ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~  ----schedule window-----
470 //                    |                                  |
471 // ===================V====================              |
472 // MBB copy 2      < MI B >                              |
473 //                                                       |
474 //                 < MI A >                              V
475 // ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~  ------------------------
476 //                    :
477 // ===================V====================
478 // MBB copy 3      < MI B'>
479 //
480 //
481 //
482 //
483 // ========================================
484 //              < Terminators >
485 // ========================================
486 int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
487   int MaxStallCycle = 0;
488   auto Range = getScheduleRange(Offset, SchedInstrNum);
489   for (auto &MI : Range) {
490     auto *SU = TripleDAG->getSUnit(&MI);
491     int DefCycle = getOriCycle(&MI);
492     for (auto &Succ : SU->Succs) {
493       if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
494         continue;
495       // If the expected cycle does not exceed MaxCycle, no check is needed.
496       if (DefCycle + (int)Succ.getLatency() <= MaxCycle)
497         continue;
498       // If the cycle of the scheduled MI A is less than that of the scheduled
499       // MI B, the scheduling will fail because the lifetime of the
500       // corresponding register exceeds II.
501       auto *SuccMI = Succ.getSUnit()->getInstr();
502       int UseCycle = getOriCycle(SuccMI);
503       if (DefCycle < UseCycle)
504         return WindowIILimit;
505       // Get the stall cycle introduced by the register between two trips.
506       int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle;
507       MaxStallCycle = std::max(MaxStallCycle, StallCycle);
508     }
509   }
510   LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
511   return MaxStallCycle;
512 }
513 
514 unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
515   LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
516   int MaxCycle = calculateMaxCycle(DAG, Offset);
517   if (MaxCycle == (int)WindowIILimit)
518     return MaxCycle;
519   int StallCycle = calculateStallCycle(Offset, MaxCycle);
520   if (StallCycle == (int)WindowIILimit)
521     return StallCycle;
522   // The value of II is equal to the maximum execution cycle plus 1.
523   return MaxCycle + StallCycle + 1;
524 }
525 
526 void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
527   LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
528   for (auto &Phi : MBB->phis()) {
529     int LateCycle = INT_MAX;
530     auto *SU = TripleDAG->getSUnit(&Phi);
531     for (auto &Succ : SU->Succs) {
532       // Phi doesn't have any Anti successors.
533       if (Succ.getKind() != SDep::Data)
534         continue;
535       // Phi is scheduled before the successor of stage 0. The issue cycle of
536       // phi is the latest cycle in this interval.
537       auto *SuccMI = Succ.getSUnit()->getInstr();
538       int Cycle = getOriCycle(SuccMI);
539       if (getOriStage(getOriMI(SuccMI), Offset) == 0)
540         LateCycle = std::min(LateCycle, Cycle);
541     }
542     // The anti-dependency of phi need to be handled separately in the same way.
543     if (Register AntiReg = getAntiRegister(&Phi)) {
544       auto *AntiMI = MRI->getVRegDef(AntiReg);
545       // AntiReg may be defined outside the kernel MBB.
546       if (AntiMI->getParent() == MBB) {
547         auto AntiCycle = getOriCycle(AntiMI);
548         if (getOriStage(getOriMI(AntiMI), Offset) == 0)
549           LateCycle = std::min(LateCycle, AntiCycle);
550       }
551     }
552     // If there is no limit to the late cycle, a default value is given.
553     if (LateCycle == INT_MAX)
554       LateCycle = (int)(II - 1);
555     LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi);
556     // The issue cycle of phi is set to the latest cycle in the interval.
557     auto *OriPhi = getOriMI(&Phi);
558     OriToCycle[OriPhi] = LateCycle;
559   }
560 }
561 
562 DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset,
563                                                              unsigned II) {
564   // At each issue cycle, phi is placed before MIs in stage 0. So the simplest
565   // way is to put phi at the beginning of the current cycle.
566   DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
567   auto Range = getScheduleRange(Offset, SchedInstrNum);
568   for (auto &Phi : MBB->phis())
569     CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi));
570   for (auto &MI : Range)
571     CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI));
572   // Each MI is assigned a separate ordered Id, which is used as a sort marker
573   // in the following expand process.
574   DenseMap<MachineInstr *, int> IssueOrder;
575   int Id = 0;
576   for (int Cycle = 0; Cycle < (int)II; ++Cycle) {
577     if (!CycleToMIs.count(Cycle))
578       continue;
579     for (auto *MI : CycleToMIs[Cycle])
580       IssueOrder[MI] = Id++;
581   }
582   return IssueOrder;
583 }
584 
585 void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
586   // At the first update, Offset is equal to SchedPhiNum. At this time, only
587   // BestII, BestOffset, and BaseII need to be updated.
588   if (Offset == SchedPhiNum) {
589     BestII = II;
590     BestOffset = SchedPhiNum;
591     BaseII = II;
592     return;
593   }
594   // The update will only continue if the II is smaller than BestII and the II
595   // is sufficiently small.
596   if ((II >= BestII) || (II + WindowDiffLimit > BaseII))
597     return;
598   BestII = II;
599   BestOffset = Offset;
600   // Record the result of the current list scheduling, noting that each MI is
601   // stored unordered in SchedResult.
602   SchedResult.clear();
603   auto IssueOrder = getIssueOrder(Offset, II);
604   for (auto &Pair : OriToCycle) {
605     assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
606     SchedResult.push_back(std::make_tuple(Pair.first, Pair.second,
607                                           getOriStage(Pair.first, Offset),
608                                           IssueOrder[Pair.first]));
609   }
610 }
611 
612 void WindowScheduler::expand() {
613   // The MIs in the SchedResult are sorted by the issue order ID.
614   llvm::stable_sort(SchedResult,
615                     [](const std::tuple<MachineInstr *, int, int, int> &A,
616                        const std::tuple<MachineInstr *, int, int, int> &B) {
617                       return std::get<3>(A) < std::get<3>(B);
618                     });
619   // Use the scheduling infrastructure for expansion, noting that InstrChanges
620   // is not supported here.
621   DenseMap<MachineInstr *, int> Cycles, Stages;
622   std::vector<MachineInstr *> OrderedInsts;
623   for (auto &Info : SchedResult) {
624     auto *MI = std::get<0>(Info);
625     OrderedInsts.push_back(MI);
626     Cycles[MI] = std::get<1>(Info);
627     Stages[MI] = std::get<2>(Info);
628     LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
629                       << "]: " << *MI);
630   }
631   ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
632                     std::move(Stages));
633   ModuloScheduleExpander MSE(*MF, MS, *Context->LIS,
634                              ModuloScheduleExpander::InstrChangesTy());
635   MSE.expand();
636   MSE.cleanup();
637 }
638 
639 void WindowScheduler::updateLiveIntervals() {
640   SmallVector<Register, 128> UsedRegs;
641   for (MachineInstr &MI : *MBB)
642     for (const MachineOperand &MO : MI.operands()) {
643       if (!MO.isReg() || MO.getReg() == 0)
644         continue;
645       Register Reg = MO.getReg();
646       if (!is_contained(UsedRegs, Reg))
647         UsedRegs.push_back(Reg);
648     }
649   Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs);
650 }
651 
652 iterator_range<MachineBasicBlock::iterator>
653 WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) {
654   auto RegionBegin = MBB->begin();
655   std::advance(RegionBegin, Offset);
656   auto RegionEnd = RegionBegin;
657   std::advance(RegionEnd, Num);
658   return make_range(RegionBegin, RegionEnd);
659 }
660 
661 int WindowScheduler::getOriCycle(MachineInstr *NewMI) {
662   assert(TriToOri.count(NewMI) && "Cannot find original MI!");
663   auto *OriMI = TriToOri[NewMI];
664   assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!");
665   return OriToCycle[OriMI];
666 }
667 
668 MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) {
669   assert(TriToOri.count(NewMI) && "Cannot find original MI!");
670   return TriToOri[NewMI];
671 }
672 
673 unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) {
674   assert(llvm::find(OriMIs, OriMI) != OriMIs.end() &&
675          "Cannot find OriMI in OriMIs!");
676   // If there is no instruction fold, all MI stages are 0.
677   if (Offset == SchedPhiNum)
678     return 0;
679   // For those MIs with an ID less than the Offset, their stages are set to 0,
680   // while the rest are set to 1.
681   unsigned Id = 0;
682   for (auto *MI : OriMIs) {
683     if (MI->isMetaInstruction())
684       continue;
685     if (MI == OriMI)
686       break;
687     ++Id;
688   }
689   return Id >= (size_t)Offset ? 1 : 0;
690 }
691 
692 Register WindowScheduler::getAntiRegister(MachineInstr *Phi) {
693   assert(Phi->isPHI() && "Expecting PHI!");
694   Register AntiReg;
695   for (auto MO : Phi->uses()) {
696     if (MO.isReg())
697       AntiReg = MO.getReg();
698     else if (MO.isMBB() && MO.getMBB() == MBB)
699       return AntiReg;
700   }
701   return 0;
702 }
703