1 //===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "llvm/CodeGen/ModuloSchedule.h"
10 #include "llvm/ADT/StringExtras.h"
11 #include "llvm/Analysis/MemoryLocation.h"
12 #include "llvm/CodeGen/LiveIntervals.h"
13 #include "llvm/CodeGen/MachineInstrBuilder.h"
14 #include "llvm/CodeGen/MachineLoopInfo.h"
15 #include "llvm/CodeGen/MachineRegisterInfo.h"
16 #include "llvm/InitializePasses.h"
17 #include "llvm/MC/MCContext.h"
18 #include "llvm/Support/Debug.h"
19 #include "llvm/Support/ErrorHandling.h"
20 #include "llvm/Support/raw_ostream.h"
21 
22 #define DEBUG_TYPE "pipeliner"
23 using namespace llvm;
24 
25 static cl::opt<bool> SwapBranchTargetsMVE(
26     "pipeliner-swap-branch-targets-mve", cl::Hidden, cl::init(false),
27     cl::desc("Swap target blocks of a conditional branch for MVE expander"));
28 
29 void ModuloSchedule::print(raw_ostream &OS) {
30   for (MachineInstr *MI : ScheduledInstrs)
31     OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
32 }
33 
34 //===----------------------------------------------------------------------===//
35 // ModuloScheduleExpander implementation
36 //===----------------------------------------------------------------------===//
37 
38 /// Return the register values for the operands of a Phi instruction.
39 /// This function assumes the instruction is a Phi.
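/// For example, given a loop-header Phi in machine IR such as
///   %v = PHI %init, %preheader, %next, %loop
/// this returns %init in InitVal and %next in LoopVal.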
40 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
41                        unsigned &InitVal, unsigned &LoopVal) {
42   assert(Phi.isPHI() && "Expecting a Phi.");
43 
44   InitVal = 0;
45   LoopVal = 0;
46   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
47     if (Phi.getOperand(i + 1).getMBB() != Loop)
48       InitVal = Phi.getOperand(i).getReg();
49     else
50       LoopVal = Phi.getOperand(i).getReg();
51 
52   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
53 }
54 
55 /// Return the Phi register value that comes from the incoming block.
56 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
57   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
58     if (Phi.getOperand(i + 1).getMBB() != LoopBB)
59       return Phi.getOperand(i).getReg();
60   return 0;
61 }
62 
63 /// Return the Phi register value that comes from the loop block.
64 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
65   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
66     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
67       return Phi.getOperand(i).getReg();
68   return 0;
69 }
70 
71 void ModuloScheduleExpander::expand() {
72   BB = Schedule.getLoop()->getTopBlock();
73   Preheader = *BB->pred_begin();
74   if (Preheader == BB)
75     Preheader = *std::next(BB->pred_begin());
76 
77   // Iterate over the definitions in each instruction, and compute the
78   // stage difference for each use.  Keep the maximum value.
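  // When the defining instruction is a loop-carried Phi, its value reaches the
  // use one iteration later, so the stage distance grows by one; a Phi that is
  // not loop carried is instead recorded as having its Phi/use order swapped.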
79   for (MachineInstr *MI : Schedule.getInstructions()) {
80     int DefStage = Schedule.getStage(MI);
81     for (const MachineOperand &Op : MI->all_defs()) {
82       Register Reg = Op.getReg();
83       unsigned MaxDiff = 0;
84       bool PhiIsSwapped = false;
85       for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
86         MachineInstr *UseMI = UseOp.getParent();
87         int UseStage = Schedule.getStage(UseMI);
88         unsigned Diff = 0;
89         if (UseStage != -1 && UseStage >= DefStage)
90           Diff = UseStage - DefStage;
91         if (MI->isPHI()) {
92           if (isLoopCarried(*MI))
93             ++Diff;
94           else
95             PhiIsSwapped = true;
96         }
97         MaxDiff = std::max(Diff, MaxDiff);
98       }
99       RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
100     }
101   }
102 
103   generatePipelinedLoop();
104 }
105 
106 void ModuloScheduleExpander::generatePipelinedLoop() {
107   LoopInfo = TII->analyzeLoopForPipelining(BB);
108   assert(LoopInfo && "Must be able to analyze loop!");
109 
110   // Create a new basic block for the kernel and add it to the CFG.
111   MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
112 
113   unsigned MaxStageCount = Schedule.getNumStages() - 1;
114 
115   // Remember the registers that are used in different stages. The index is
116   // the iteration, or stage, that the instruction is scheduled in.  This is
117   // a map between register names in the original block and the names created
118   // in each stage of the pipelined loop.
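  // VRMap[S][R] holds the virtual register created for R by the copy of its
  // defining instruction in stage S; updateInstruction and the Phi-generation
  // routines fill it in, and later stages read from it when renaming uses.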
119   ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
120 
121   // The renaming destination by Phis for the registers across stages.
122   // This map is updated during Phis generation to point to the most recent
123   // renaming destination.
124   ValueMapTy *VRMapPhi = new ValueMapTy[(MaxStageCount + 1) * 2];
125 
126   InstrMapTy InstrMap;
127 
128   SmallVector<MachineBasicBlock *, 4> PrologBBs;
129 
130   // Generate the prolog instructions that set up the pipeline.
131   generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
132   MF.insert(BB->getIterator(), KernelBB);
133   LIS.insertMBBInMaps(KernelBB);
134 
135   // Rearrange the instructions to generate the new, pipelined loop,
136   // and update register names as needed.
137   for (MachineInstr *CI : Schedule.getInstructions()) {
138     if (CI->isPHI())
139       continue;
140     unsigned StageNum = Schedule.getStage(CI);
141     MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
142     updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
143     KernelBB->push_back(NewMI);
144     InstrMap[NewMI] = CI;
145   }
146 
147   // Copy any terminator instructions to the new kernel, and update
148   // names as needed.
149   for (MachineInstr &MI : BB->terminators()) {
150     MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
151     updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
152     KernelBB->push_back(NewMI);
153     InstrMap[NewMI] = &MI;
154   }
155 
156   NewKernel = KernelBB;
157   KernelBB->transferSuccessors(BB);
158   KernelBB->replaceSuccessor(BB, KernelBB);
159 
160   generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
161                        InstrMap, MaxStageCount, MaxStageCount, false);
162   generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, VRMapPhi,
163                InstrMap, MaxStageCount, MaxStageCount, false);
164 
165   LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
166 
167   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
168   // Generate the epilog instructions to complete the pipeline.
169   generateEpilog(MaxStageCount, KernelBB, BB, VRMap, VRMapPhi, EpilogBBs,
170                  PrologBBs);
171 
172   // We need this step because the register allocation doesn't handle some
173   // situations well, so we insert copies to help out.
174   splitLifetimes(KernelBB, EpilogBBs);
175 
176   // Remove dead instructions due to loop induction variables.
177   removeDeadInstructions(KernelBB, EpilogBBs);
178 
179   // Add branches between prolog and epilog blocks.
180   addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
181 
182   delete[] VRMap;
183   delete[] VRMapPhi;
184 }
185 
186 void ModuloScheduleExpander::cleanup() {
187   // Remove the original loop since it's no longer referenced.
188   for (auto &I : *BB)
189     LIS.RemoveMachineInstrFromMaps(I);
190   BB->clear();
191   BB->eraseFromParent();
192 }
193 
194 /// Generate the pipeline prolog code.
195 void ModuloScheduleExpander::generateProlog(unsigned LastStage,
196                                             MachineBasicBlock *KernelBB,
197                                             ValueMapTy *VRMap,
198                                             MBBVectorTy &PrologBBs) {
199   MachineBasicBlock *PredBB = Preheader;
200   InstrMapTy InstrMap;
201 
202   // Generate a basic block for each stage, not including the last stage,
203   // which will be generated in the kernel. Each basic block may contain
204   // instructions from multiple stages/iterations.
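  // For a three-stage schedule, for instance, the first prolog block holds the
  // stage-0 instructions of iteration 0, and the second prolog block holds the
  // stage-1 instructions of iteration 0 followed by the stage-0 instructions
  // of iteration 1.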
205   for (unsigned i = 0; i < LastStage; ++i) {
206     // Create and insert the prolog basic block prior to the original loop
207     // basic block.  The original loop is removed later.
208     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
209     PrologBBs.push_back(NewBB);
210     MF.insert(BB->getIterator(), NewBB);
211     NewBB->transferSuccessors(PredBB);
212     PredBB->addSuccessor(NewBB);
213     PredBB = NewBB;
214     LIS.insertMBBInMaps(NewBB);
215 
216     // Generate instructions for each appropriate stage. Process instructions
217     // in original program order.
218     for (int StageNum = i; StageNum >= 0; --StageNum) {
219       for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
220                                        BBE = BB->getFirstTerminator();
221            BBI != BBE; ++BBI) {
222         if (Schedule.getStage(&*BBI) == StageNum) {
223           if (BBI->isPHI())
224             continue;
225           MachineInstr *NewMI =
226               cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
227           updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
228           NewBB->push_back(NewMI);
229           InstrMap[NewMI] = &*BBI;
230         }
231       }
232     }
233     rewritePhiValues(NewBB, i, VRMap, InstrMap);
234     LLVM_DEBUG({
235       dbgs() << "prolog:\n";
236       NewBB->dump();
237     });
238   }
239 
240   PredBB->replaceSuccessor(BB, KernelBB);
241 
242   // Check if we need to remove the branch from the preheader to the original
243   // loop, and replace it with a branch to the new loop.
244   unsigned numBranches = TII->removeBranch(*Preheader);
245   if (numBranches) {
246     SmallVector<MachineOperand, 0> Cond;
247     TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
248   }
249 }
250 
251 /// Generate the pipeline epilog code. The epilog code finishes the iterations
252 /// that were started in either the prolog or the kernel.  We create a basic
253 /// block for each stage that needs to complete.
254 void ModuloScheduleExpander::generateEpilog(
255     unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
256     ValueMapTy *VRMap, ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
257     MBBVectorTy &PrologBBs) {
258   // We need to change the branch from the kernel to the first epilog block, so
259   // this call to analyze branch uses the kernel rather than the original BB.
260   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
261   SmallVector<MachineOperand, 4> Cond;
262   bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
263   assert(!checkBranch && "generateEpilog must be able to analyze the branch");
264   if (checkBranch)
265     return;
266 
267   MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
268   if (*LoopExitI == KernelBB)
269     ++LoopExitI;
270   assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
271   MachineBasicBlock *LoopExitBB = *LoopExitI;
272 
273   MachineBasicBlock *PredBB = KernelBB;
274   MachineBasicBlock *EpilogStart = LoopExitBB;
275   InstrMapTy InstrMap;
276 
277   // Generate a basic block for each stage, not including the last stage,
278   // which was generated for the kernel.  Each basic block may contain
279   // instructions from multiple stages/iterations.
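  // With three stages, for instance, the first epilog block receives only the
  // stage-2 instructions, and the following epilog block receives the stage-1
  // and stage-2 instructions.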
280   int EpilogStage = LastStage + 1;
281   for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
282     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
283     EpilogBBs.push_back(NewBB);
284     MF.insert(BB->getIterator(), NewBB);
285 
286     PredBB->replaceSuccessor(LoopExitBB, NewBB);
287     NewBB->addSuccessor(LoopExitBB);
288     LIS.insertMBBInMaps(NewBB);
289 
290     if (EpilogStart == LoopExitBB)
291       EpilogStart = NewBB;
292 
293     // Add instructions to the epilog depending on the current block.
294     // Process instructions in original program order.
295     for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
296       for (auto &BBI : *BB) {
297         if (BBI.isPHI())
298           continue;
299         MachineInstr *In = &BBI;
300         if ((unsigned)Schedule.getStage(In) == StageNum) {
301           // Instructions with memoperands in the epilog are updated with
302           // conservative values.
303           MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
304           updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
305           NewBB->push_back(NewMI);
306           InstrMap[NewMI] = In;
307         }
308       }
309     }
310     generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
311                          InstrMap, LastStage, EpilogStage, i == 1);
312     generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, VRMapPhi,
313                  InstrMap, LastStage, EpilogStage, i == 1);
314     PredBB = NewBB;
315 
316     LLVM_DEBUG({
317       dbgs() << "epilog:\n";
318       NewBB->dump();
319     });
320   }
321 
322   // Fix any Phi nodes in the loop exit block.
323   LoopExitBB->replacePhiUsesWith(BB, PredBB);
324 
325   // Create a branch to the new epilog from the kernel.
326   // Remove the original branch and add a new branch to the epilog.
327   TII->removeBranch(*KernelBB);
328   assert((OrigBB == TBB || OrigBB == FBB) &&
329          "Unable to determine looping branch direction");
330   if (OrigBB != TBB)
331     TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
332   else
333     TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
334   // Add a branch to the loop exit.
335   if (EpilogBBs.size() > 0) {
336     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
337     SmallVector<MachineOperand, 4> Cond1;
338     TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
339   }
340 }
341 
342 /// Replace all uses of FromReg that appear outside the specified
343 /// basic block with ToReg.
344 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
345                                     MachineBasicBlock *MBB,
346                                     MachineRegisterInfo &MRI,
347                                     LiveIntervals &LIS) {
348   for (MachineOperand &O :
349        llvm::make_early_inc_range(MRI.use_operands(FromReg)))
350     if (O.getParent()->getParent() != MBB)
351       O.setReg(ToReg);
352   if (!LIS.hasInterval(ToReg))
353     LIS.createEmptyInterval(ToReg);
354 }
355 
356 /// Return true if the register has a use that occurs outside the
357 /// specified loop.
358 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
359                             MachineRegisterInfo &MRI) {
360   for (const MachineOperand &MO : MRI.use_operands(Reg))
361     if (MO.getParent()->getParent() != BB)
362       return true;
363   return false;
364 }
365 
366 /// Generate Phis for the specific block in the generated pipelined code.
367 /// This function looks at the Phis from the original code to guide the
368 /// creation of new Phis.
369 void ModuloScheduleExpander::generateExistingPhis(
370     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
371     MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
372     unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
373   // Compute the stage number for the initial value of the Phi, which
374   // comes from the prolog. The prolog to use depends on which kernel or
375   // epilog block we're adding the Phi to.
376   unsigned PrologStage = 0;
377   unsigned PrevStage = 0;
378   bool InKernel = (LastStageNum == CurStageNum);
379   if (InKernel) {
380     PrologStage = LastStageNum - 1;
381     PrevStage = CurStageNum;
382   } else {
383     PrologStage = LastStageNum - (CurStageNum - LastStageNum);
384     PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
385   }
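  // For example, when generating the first epilog block of a three-stage
  // schedule (LastStageNum = 2, CurStageNum = 3), PrologStage is 1 and
  // PrevStage is 2.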
386 
387   for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
388                                    BBE = BB->getFirstNonPHI();
389        BBI != BBE; ++BBI) {
390     Register Def = BBI->getOperand(0).getReg();
391 
392     unsigned InitVal = 0;
393     unsigned LoopVal = 0;
394     getPhiRegs(*BBI, BB, InitVal, LoopVal);
395 
396     unsigned PhiOp1 = 0;
397     // The Phi value from the loop body typically is defined in the loop, but
398     // not always. So, we need to check if the value is defined in the loop.
399     unsigned PhiOp2 = LoopVal;
400     if (auto It = VRMap[LastStageNum].find(LoopVal);
401         It != VRMap[LastStageNum].end())
402       PhiOp2 = It->second;
403 
404     int StageScheduled = Schedule.getStage(&*BBI);
405     int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
406     unsigned NumStages = getStagesForReg(Def, CurStageNum);
407     if (NumStages == 0) {
408       // We don't need to generate a Phi anymore, but we need to rename any uses
409       // of the Phi value.
410       unsigned NewReg = VRMap[PrevStage][LoopVal];
411       rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
412                             InitVal, NewReg);
413       if (VRMap[CurStageNum].count(LoopVal))
414         VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
415     }
416     // Adjust the number of Phis needed depending on the number of prologs left,
417     // and the distance from where the Phi is first scheduled. The number of
418     // Phis cannot exceed the number of prolog stages. Each stage can
419     // potentially define two values.
420     unsigned MaxPhis = PrologStage + 2;
421     if (!InKernel && (int)PrologStage <= LoopValStage)
422       MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
423     unsigned NumPhis = std::min(NumStages, MaxPhis);
424 
425     unsigned NewReg = 0;
426     unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
427     // In the epilog, we may need to look back one stage to get the correct
428     // Phi name, because the epilog and prolog blocks execute the same stage.
429     // The correct name is from the previous block only when the Phi has
430   // been completely scheduled prior to the epilog, and the Phi value is not
431     // needed in multiple stages.
432     int StageDiff = 0;
433     if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
434         NumPhis == 1)
435       StageDiff = 1;
436     // Adjust the computations below when the phi and the loop definition
437     // are scheduled in different stages.
438     if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
439       StageDiff = StageScheduled - LoopValStage;
440     for (unsigned np = 0; np < NumPhis; ++np) {
441       // If the Phi hasn't been scheduled, then use the initial Phi operand
442       // value. Otherwise, use the scheduled version of the instruction. This
443       // is a little complicated when a Phi references another Phi.
444       if (np > PrologStage || StageScheduled >= (int)LastStageNum)
445         PhiOp1 = InitVal;
446       // Check if the Phi has already been scheduled in a prolog stage.
447       else if (PrologStage >= AccessStage + StageDiff + np &&
448                VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
449         PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
450       // Check if the Phi has already been scheduled, but the loop instruction
451       // is either another Phi, or doesn't occur in the loop.
452       else if (PrologStage >= AccessStage + StageDiff + np) {
453         // If the Phi references another Phi, we need to examine the other
454         // Phi to get the correct value.
455         PhiOp1 = LoopVal;
456         MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
457         int Indirects = 1;
458         while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
459           int PhiStage = Schedule.getStage(InstOp1);
460           if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
461             PhiOp1 = getInitPhiReg(*InstOp1, BB);
462           else
463             PhiOp1 = getLoopPhiReg(*InstOp1, BB);
464           InstOp1 = MRI.getVRegDef(PhiOp1);
465           int PhiOpStage = Schedule.getStage(InstOp1);
466           int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
467           if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
468               VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
469             PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
470             break;
471           }
472           ++Indirects;
473         }
474       } else
475         PhiOp1 = InitVal;
476       // If this references a generated Phi in the kernel, get the Phi operand
477       // from the incoming block.
478       if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
479         if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
480           PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
481 
482       MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
483       bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
484       // In the epilog, a map lookup is needed to get the value from the kernel,
485       // or previous epilog block. Which one to use depends on whether the
486       // instruction is scheduled in the previous block.
487       if (!InKernel) {
488         int StageDiffAdj = 0;
489         if (LoopValStage != -1 && StageScheduled > LoopValStage)
490           StageDiffAdj = StageScheduled - LoopValStage;
491         // Use the loop value defined in the kernel, unless the kernel
492         // contains the last definition of the Phi.
493         if (np == 0 && PrevStage == LastStageNum &&
494             (StageScheduled != 0 || LoopValStage != 0) &&
495             VRMap[PrevStage - StageDiffAdj].count(LoopVal))
496           PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
497         // Use the value defined by the Phi. We add one because we switch
498         // from looking at the loop value to the Phi definition.
499         else if (np > 0 && PrevStage == LastStageNum &&
500                  VRMap[PrevStage - np + 1].count(Def))
501           PhiOp2 = VRMap[PrevStage - np + 1][Def];
502         // Use the loop value defined in the kernel.
503         else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
504                  VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
505           PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
506         // Use the value defined by the Phi, unless we're generating the first
507         // epilog and the Phi refers to a Phi in a different stage.
508         else if (VRMap[PrevStage - np].count(Def) &&
509                  (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
510                   (LoopValStage == StageScheduled)))
511           PhiOp2 = VRMap[PrevStage - np][Def];
512       }
513 
514       // Check if we can reuse an existing Phi. This occurs when a Phi
515       // references another Phi, and the other Phi is scheduled in an
516       // earlier stage. We can try to reuse an existing Phi up until the last
517       // stage of the current Phi.
518       if (LoopDefIsPhi) {
519         if (static_cast<int>(PrologStage - np) >= StageScheduled) {
520           int LVNumStages = getStagesForPhi(LoopVal);
521           int StageDiff = (StageScheduled - LoopValStage);
522           LVNumStages -= StageDiff;
523           // Make sure the loop value Phi has been processed already.
524           if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
525             NewReg = PhiOp2;
526             unsigned ReuseStage = CurStageNum;
527             if (isLoopCarried(*PhiInst))
528               ReuseStage -= LVNumStages;
529             // Check if the Phi to reuse has been generated yet. If not, then
530             // there is nothing to reuse.
531             if (VRMap[ReuseStage - np].count(LoopVal)) {
532               NewReg = VRMap[ReuseStage - np][LoopVal];
533 
534               rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
535                                     Def, NewReg);
536               // Update the map with the new Phi name.
537               VRMap[CurStageNum - np][Def] = NewReg;
538               PhiOp2 = NewReg;
539               if (VRMap[LastStageNum - np - 1].count(LoopVal))
540                 PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
541 
542               if (IsLast && np == NumPhis - 1)
543                 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
544               continue;
545             }
546           }
547         }
548         if (InKernel && StageDiff > 0 &&
549             VRMap[CurStageNum - StageDiff - np].count(LoopVal))
550           PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
551       }
552 
553       const TargetRegisterClass *RC = MRI.getRegClass(Def);
554       NewReg = MRI.createVirtualRegister(RC);
555 
556       MachineInstrBuilder NewPhi =
557           BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
558                   TII->get(TargetOpcode::PHI), NewReg);
559       NewPhi.addReg(PhiOp1).addMBB(BB1);
560       NewPhi.addReg(PhiOp2).addMBB(BB2);
561       if (np == 0)
562         InstrMap[NewPhi] = &*BBI;
563 
564       // We define the Phis after creating the new pipelined code, so
565       // we need to rename the Phi values in scheduled instructions.
566 
567       unsigned PrevReg = 0;
568       if (InKernel && VRMap[PrevStage - np].count(LoopVal))
569         PrevReg = VRMap[PrevStage - np][LoopVal];
570       rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
571                             NewReg, PrevReg);
572       // If the Phi has been scheduled, use the new name for rewriting.
573       if (VRMap[CurStageNum - np].count(Def)) {
574         unsigned R = VRMap[CurStageNum - np][Def];
575         rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
576                               NewReg);
577       }
578 
579       // Check if we need to rename any uses that occur after the loop. The
580       // register to replace depends on whether the Phi is scheduled in the
581       // epilog.
582       if (IsLast && np == NumPhis - 1)
583         replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
584 
585       // In the kernel, a dependent Phi uses the value from this Phi.
586       if (InKernel)
587         PhiOp2 = NewReg;
588 
589       // Update the map with the new Phi name.
590       VRMap[CurStageNum - np][Def] = NewReg;
591     }
592 
593     while (NumPhis++ < NumStages) {
594       rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
595                             NewReg, 0);
596     }
597 
598     // Check if we need to rename a Phi that has been eliminated due to
599     // scheduling.
600     if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
601       replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
602   }
603 }
604 
605 /// Generate Phis for the specified block in the generated pipelined code.
606 /// These are new Phis needed because the definition is scheduled after the
607 /// use in the pipelined sequence.
608 void ModuloScheduleExpander::generatePhis(
609     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
610     MachineBasicBlock *KernelBB, ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
611     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
612     bool IsLast) {
613   // Compute the stage number that contains the initial Phi value, and
614   // the Phi from the previous stage.
615   unsigned PrologStage = 0;
616   unsigned PrevStage = 0;
617   unsigned StageDiff = CurStageNum - LastStageNum;
618   bool InKernel = (StageDiff == 0);
619   if (InKernel) {
620     PrologStage = LastStageNum - 1;
621     PrevStage = CurStageNum;
622   } else {
623     PrologStage = LastStageNum - StageDiff;
624     PrevStage = LastStageNum + StageDiff - 1;
625   }
626 
627   for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
628                                    BBE = BB->instr_end();
629        BBI != BBE; ++BBI) {
630     for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
631       MachineOperand &MO = BBI->getOperand(i);
632       if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
633         continue;
634 
635       int StageScheduled = Schedule.getStage(&*BBI);
636       assert(StageScheduled != -1 && "Expecting scheduled instruction.");
637       Register Def = MO.getReg();
638       unsigned NumPhis = getStagesForReg(Def, CurStageNum);
639       // An instruction that is scheduled in stage 0 and used after the loop
640       // requires a phi in the epilog for the last definition from either
641       // the kernel or prolog.
642       if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
643           hasUseAfterLoop(Def, BB, MRI))
644         NumPhis = 1;
645       if (!InKernel && (unsigned)StageScheduled > PrologStage)
646         continue;
647 
648       unsigned PhiOp2;
649       if (InKernel) {
650         PhiOp2 = VRMap[PrevStage][Def];
651         if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
652           if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
653             PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
654       }
655       // The number of Phis can't exceed the number of prolog stages. The
656       // prolog stage number is zero based.
657       if (NumPhis > PrologStage + 1 - StageScheduled)
658         NumPhis = PrologStage + 1 - StageScheduled;
659       for (unsigned np = 0; np < NumPhis; ++np) {
660         // Example for
661         // Org:
662         //   %Org = ... (Scheduled at Stage#0, NumPhi = 2)
663         //
664         // Prolog0 (Stage0):
665         //   %Clone0 = ...
666         // Prolog1 (Stage1):
667         //   %Clone1 = ...
668         // Kernel (Stage2):
669         //   %Phi0 = Phi %Clone1, Prolog1, %Clone2, Kernel
670         //   %Phi1 = Phi %Clone0, Prolog1, %Phi0, Kernel
671         //   %Clone2 = ...
672         // Epilog0 (Stage3):
673         //   %Phi2 = Phi %Clone1, Prolog1, %Clone2, Kernel
674         //   %Phi3 = Phi %Clone0, Prolog1, %Phi0, Kernel
675         // Epilog1 (Stage4):
676         //   %Phi4 = Phi %Clone0, Prolog0, %Phi2, Epilog0
677         //
678         // VRMap = {0: %Clone0, 1: %Clone1, 2: %Clone2}
679         // VRMapPhi (after Kernel) = {0: %Phi1, 1: %Phi0}
680         // VRMapPhi (after Epilog0) = {0: %Phi3, 1: %Phi2}
681 
682         unsigned PhiOp1 = VRMap[PrologStage][Def];
683         if (np <= PrologStage)
684           PhiOp1 = VRMap[PrologStage - np][Def];
685         if (!InKernel) {
686           if (PrevStage == LastStageNum && np == 0)
687             PhiOp2 = VRMap[LastStageNum][Def];
688           else
689             PhiOp2 = VRMapPhi[PrevStage - np][Def];
690         }
691 
692         const TargetRegisterClass *RC = MRI.getRegClass(Def);
693         Register NewReg = MRI.createVirtualRegister(RC);
694 
695         MachineInstrBuilder NewPhi =
696             BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
697                     TII->get(TargetOpcode::PHI), NewReg);
698         NewPhi.addReg(PhiOp1).addMBB(BB1);
699         NewPhi.addReg(PhiOp2).addMBB(BB2);
700         if (np == 0)
701           InstrMap[NewPhi] = &*BBI;
702 
703         // Rewrite uses and update the map. The actions depend upon whether
704         // we are generating code for the kernel or epilog blocks.
705         if (InKernel) {
706           rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
707                                 NewReg);
708           rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
709                                 NewReg);
710 
711           PhiOp2 = NewReg;
712           VRMapPhi[PrevStage - np - 1][Def] = NewReg;
713         } else {
714           VRMapPhi[CurStageNum - np][Def] = NewReg;
715           if (np == NumPhis - 1)
716             rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
717                                   NewReg);
718         }
719         if (IsLast && np == NumPhis - 1)
720           replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
721       }
722     }
723   }
724 }
725 
726 /// Remove instructions that generate values with no uses.
727 /// Typically, these are induction variable operations that generate values
728 /// used in the loop itself.  A dead instruction has a definition with
729 /// no uses, or uses that occur in the original loop only.
730 void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
731                                                     MBBVectorTy &EpilogBBs) {
732   // For each epilog block, check that the value defined by each instruction
733   // is used.  If not, delete it.
734   for (MachineBasicBlock *MBB : llvm::reverse(EpilogBBs))
735     for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
736                                                    ME = MBB->instr_rend();
737          MI != ME;) {
738       // From DeadMachineInstructionElim. Don't delete inline assembly.
739       if (MI->isInlineAsm()) {
740         ++MI;
741         continue;
742       }
743       bool SawStore = false;
744       // Check if it's safe to remove the instruction due to side effects.
745       // We can, and want to, remove Phis here.
746       if (!MI->isSafeToMove(SawStore) && !MI->isPHI()) {
747         ++MI;
748         continue;
749       }
750       bool used = true;
751       for (const MachineOperand &MO : MI->all_defs()) {
752         Register reg = MO.getReg();
753         // Assume physical registers are used, unless they are marked dead.
754         if (reg.isPhysical()) {
755           used = !MO.isDead();
756           if (used)
757             break;
758           continue;
759         }
760         unsigned realUses = 0;
761         for (const MachineOperand &U : MRI.use_operands(reg)) {
762           // Check if there are any uses that occur only in the original
763           // loop.  If so, that's not a real use.
764           if (U.getParent()->getParent() != BB) {
765             realUses++;
766             used = true;
767             break;
768           }
769         }
770         if (realUses > 0)
771           break;
772         used = false;
773       }
774       if (!used) {
775         LIS.RemoveMachineInstrFromMaps(*MI);
776         MI++->eraseFromParent();
777         continue;
778       }
779       ++MI;
780     }
781   // In the kernel block, check if we can remove a Phi that generates a value
782   // used in an instruction removed in the epilog block.
783   for (MachineInstr &MI : llvm::make_early_inc_range(KernelBB->phis())) {
784     Register reg = MI.getOperand(0).getReg();
785     if (MRI.use_begin(reg) == MRI.use_end()) {
786       LIS.RemoveMachineInstrFromMaps(MI);
787       MI.eraseFromParent();
788     }
789   }
790 }
791 
792 /// For loop carried definitions, we split the lifetime of a virtual register
793 /// that has uses past the definition in the next iteration. A copy with a new
794 /// virtual register is inserted before the definition, which helps with
795 /// generating a better register assignment.
796 ///
797 ///   v1 = phi(a, v2)     v1 = phi(a, v2)
798 ///   v2 = phi(b, v3)     v2 = phi(b, v3)
799 ///   v3 = ..             v4 = copy v1
800 ///   .. = v1             v3 = ..
801 ///                       .. = v4
802 void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
803                                             MBBVectorTy &EpilogBBs) {
804   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
805   for (auto &PHI : KernelBB->phis()) {
806     Register Def = PHI.getOperand(0).getReg();
807     // Check for any Phi definition that is used as an operand of another Phi
808     // in the same block.
809     for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
810                                                  E = MRI.use_instr_end();
811          I != E; ++I) {
812       if (I->isPHI() && I->getParent() == KernelBB) {
813         // Get the loop carried definition.
814         unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
815         if (!LCDef)
816           continue;
817         MachineInstr *MI = MRI.getVRegDef(LCDef);
818         if (!MI || MI->getParent() != KernelBB || MI->isPHI())
819           continue;
820         // Search through the rest of the block looking for uses of the Phi
821         // definition. If one occurs, then split the lifetime.
822         unsigned SplitReg = 0;
823         for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
824                                     KernelBB->instr_end()))
825           if (BBJ.readsRegister(Def, /*TRI=*/nullptr)) {
826             // We split the lifetime when we find the first use.
827             if (SplitReg == 0) {
828               SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
829               BuildMI(*KernelBB, MI, MI->getDebugLoc(),
830                       TII->get(TargetOpcode::COPY), SplitReg)
831                   .addReg(Def);
832             }
833             BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
834           }
835         if (!SplitReg)
836           continue;
837         // Search through each of the epilog blocks for any uses to be renamed.
838         for (auto &Epilog : EpilogBBs)
839           for (auto &I : *Epilog)
840             if (I.readsRegister(Def, /*TRI=*/nullptr))
841               I.substituteRegister(Def, SplitReg, 0, *TRI);
842         break;
843       }
844     }
845   }
846 }
847 
848 /// Remove the incoming block from the Phis in a basic block.
849 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
850   for (MachineInstr &MI : *BB) {
851     if (!MI.isPHI())
852       break;
853     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
854       if (MI.getOperand(i + 1).getMBB() == Incoming) {
855         MI.removeOperand(i + 1);
856         MI.removeOperand(i);
857         break;
858       }
859   }
860 }
861 
862 /// Create branches from each prolog basic block to the appropriate epilog
863 /// block.  These edges are needed if the loop ends before reaching the
864 /// kernel.
865 void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
866                                          MBBVectorTy &PrologBBs,
867                                          MachineBasicBlock *KernelBB,
868                                          MBBVectorTy &EpilogBBs,
869                                          ValueMapTy *VRMap) {
870   assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
871   MachineBasicBlock *LastPro = KernelBB;
872   MachineBasicBlock *LastEpi = KernelBB;
873 
874   // Start from the blocks connected to the kernel and work "out"
875   // to the first prolog and the last epilog blocks.
876   SmallVector<MachineInstr *, 4> PrevInsts;
877   unsigned MaxIter = PrologBBs.size() - 1;
878   for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
879     // Add branches to the prolog that go to the corresponding
880     // epilog, and the fall-thru prolog/kernel block.
881     MachineBasicBlock *Prolog = PrologBBs[j];
882     MachineBasicBlock *Epilog = EpilogBBs[i];
883 
884     SmallVector<MachineOperand, 4> Cond;
885     std::optional<bool> StaticallyGreater =
886         LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
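    // An unset (std::nullopt) result means the trip-count test must be made at
    // run time, so a conditional branch to the epilog is inserted. A
    // statically-false result means this iteration count is never reached: the
    // prolog branches directly to the epilog and the now-unreachable blocks
    // are removed. A statically-true result means the kernel is always reached
    // and only the branch to the fall-through block is kept.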
887     unsigned numAdded = 0;
888     if (!StaticallyGreater) {
889       Prolog->addSuccessor(Epilog);
890       numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
891     } else if (*StaticallyGreater == false) {
892       Prolog->addSuccessor(Epilog);
893       Prolog->removeSuccessor(LastPro);
894       LastEpi->removeSuccessor(Epilog);
895       numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
896       removePhis(Epilog, LastEpi);
897       // Remove the blocks that are no longer referenced.
898       if (LastPro != LastEpi) {
899         LastEpi->clear();
900         LastEpi->eraseFromParent();
901       }
902       if (LastPro == KernelBB) {
903         LoopInfo->disposed(&LIS);
904         NewKernel = nullptr;
905       }
906       LastPro->clear();
907       LastPro->eraseFromParent();
908     } else {
909       numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
910       removePhis(Epilog, Prolog);
911     }
912     LastPro = Prolog;
913     LastEpi = Epilog;
914     for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
915                                                    E = Prolog->instr_rend();
916          I != E && numAdded > 0; ++I, --numAdded)
917       updateInstruction(&*I, false, j, 0, VRMap);
918   }
919 
920   if (NewKernel) {
921     LoopInfo->setPreheader(PrologBBs[MaxIter]);
922     LoopInfo->adjustTripCount(-(MaxIter + 1));
923   }
924 }
925 
926 /// Return true if we can compute the amount the instruction changes
927 /// during each iteration. Set Delta to the amount of the change.
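/// For example, if the access's base register is advanced by 4 bytes by an
/// induction-variable update on every iteration, Delta is set to 4.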
928 bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
929   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
930   const MachineOperand *BaseOp;
931   int64_t Offset;
932   bool OffsetIsScalable;
933   if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
934     return false;
935 
936   // FIXME: This algorithm assumes instructions have fixed-size offsets.
937   if (OffsetIsScalable)
938     return false;
939 
940   if (!BaseOp->isReg())
941     return false;
942 
943   Register BaseReg = BaseOp->getReg();
944 
945   MachineRegisterInfo &MRI = MF.getRegInfo();
946   // Check if there is a Phi. If so, get the definition in the loop.
947   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
948   if (BaseDef && BaseDef->isPHI()) {
949     BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
950     BaseDef = MRI.getVRegDef(BaseReg);
951   }
952   if (!BaseDef)
953     return false;
954 
955   int D = 0;
956   if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
957     return false;
958 
959   Delta = D;
960   return true;
961 }
962 
963 /// Update the memory operand with a new offset when the pipeliner
964 /// generates a new copy of the instruction that refers to a
965 /// different memory location.
966 void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
967                                                MachineInstr &OldMI,
968                                                unsigned Num) {
969   if (Num == 0)
970     return;
971   // If the instruction has memory operands, then adjust the offset
972   // when the instruction appears in different stages.
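  // Each stage of separation corresponds to one extra loop iteration, so the
  // memory operand is rebased by Delta * Num; if the per-iteration change
  // cannot be computed, the operand is widened to a conservative size instead.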
973   if (NewMI.memoperands_empty())
974     return;
975   SmallVector<MachineMemOperand *, 2> NewMMOs;
976   for (MachineMemOperand *MMO : NewMI.memoperands()) {
977     // TODO: Figure out whether isAtomic is really necessary (see D57601).
978     if (MMO->isVolatile() || MMO->isAtomic() ||
979         (MMO->isInvariant() && MMO->isDereferenceable()) ||
980         (!MMO->getValue())) {
981       NewMMOs.push_back(MMO);
982       continue;
983     }
984     unsigned Delta;
985     if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
986       int64_t AdjOffset = Delta * Num;
987       NewMMOs.push_back(
988           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
989     } else {
990       NewMMOs.push_back(MF.getMachineMemOperand(
991           MMO, 0, LocationSize::beforeOrAfterPointer()));
992     }
993   }
994   NewMI.setMemRefs(MF, NewMMOs);
995 }
996 
997 /// Clone the instruction for the new pipelined loop and update the
998 /// memory operands, if needed.
999 MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
1000                                                  unsigned CurStageNum,
1001                                                  unsigned InstStageNum) {
1002   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
1003   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
1004   return NewMI;
1005 }
1006 
1007 /// Clone the instruction for the new pipelined loop. If needed, this
1008 /// function updates the instruction using the values saved in the
1009 /// InstrChanges structure.
1010 MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
1011     MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
1012   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
1013   auto It = InstrChanges.find(OldMI);
1014   if (It != InstrChanges.end()) {
1015     std::pair<unsigned, int64_t> RegAndOffset = It->second;
1016     unsigned BasePos, OffsetPos;
1017     if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
1018       return nullptr;
1019     int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
1020     MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
1021     if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
1022       NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
1023     NewMI->getOperand(OffsetPos).setImm(NewOffset);
1024   }
1025   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
1026   return NewMI;
1027 }
1028 
1029 /// Update the machine instruction with new virtual registers.  This
1030 /// function may change the definitions and/or uses.
1031 void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
1032                                                bool LastDef,
1033                                                unsigned CurStageNum,
1034                                                unsigned InstrStageNum,
1035                                                ValueMapTy *VRMap) {
1036   for (MachineOperand &MO : NewMI->operands()) {
1037     if (!MO.isReg() || !MO.getReg().isVirtual())
1038       continue;
1039     Register reg = MO.getReg();
1040     if (MO.isDef()) {
1041       // Create a new virtual register for the definition.
1042       const TargetRegisterClass *RC = MRI.getRegClass(reg);
1043       Register NewReg = MRI.createVirtualRegister(RC);
1044       MO.setReg(NewReg);
1045       VRMap[CurStageNum][reg] = NewReg;
1046       if (LastDef)
1047         replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
1048     } else if (MO.isUse()) {
1049       MachineInstr *Def = MRI.getVRegDef(reg);
1050       // Compute the stage that contains the last definition for the instruction.
1051       int DefStageNum = Schedule.getStage(Def);
1052       unsigned StageNum = CurStageNum;
1053       if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
1054         // Compute the difference in stages between the definition and the use.
1055         unsigned StageDiff = (InstrStageNum - DefStageNum);
1056         // Make an adjustment to get the last definition.
1057         StageNum -= StageDiff;
1058       }
1059       if (auto It = VRMap[StageNum].find(reg); It != VRMap[StageNum].end())
1060         MO.setReg(It->second);
1061     }
1062   }
1063 }
1064 
1065 /// Return the instruction in the loop that defines the register.
1066 /// If the definition is a Phi, then follow the Phi operand to
1067 /// the instruction in the loop.
1068 MachineInstr *ModuloScheduleExpander::findDefInLoop(unsigned Reg) {
1069   SmallPtrSet<MachineInstr *, 8> Visited;
1070   MachineInstr *Def = MRI.getVRegDef(Reg);
1071   while (Def->isPHI()) {
1072     if (!Visited.insert(Def).second)
1073       break;
1074     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
1075       if (Def->getOperand(i + 1).getMBB() == BB) {
1076         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
1077         break;
1078       }
1079   }
1080   return Def;
1081 }
1082 
1083 /// Return the new name for the value from the previous stage.
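/// When the loop value is itself a Phi that has not been renamed for the
/// requested stage yet, the lookup recurses through the Phi chain one stage at
/// a time.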
1084 unsigned ModuloScheduleExpander::getPrevMapVal(
1085     unsigned StageNum, unsigned PhiStage, unsigned LoopVal, unsigned LoopStage,
1086     ValueMapTy *VRMap, MachineBasicBlock *BB) {
1087   unsigned PrevVal = 0;
1088   if (StageNum > PhiStage) {
1089     MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
1090     if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
1091       // The name is defined in the previous stage.
1092       PrevVal = VRMap[StageNum - 1][LoopVal];
1093     else if (VRMap[StageNum].count(LoopVal))
1094       // The previous name is defined in the current stage when the instruction
1095       // order is swapped.
1096       PrevVal = VRMap[StageNum][LoopVal];
1097     else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
1098       // The loop value hasn't yet been scheduled.
1099       PrevVal = LoopVal;
1100     else if (StageNum == PhiStage + 1)
1101       // The loop value is another phi, which has not been scheduled.
1102       PrevVal = getInitPhiReg(*LoopInst, BB);
1103     else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
1104       // The loop value is another phi, which has been scheduled.
1105       PrevVal =
1106           getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
1107                         LoopStage, VRMap, BB);
1108   }
1109   return PrevVal;
1110 }
1111 
1112 /// Rewrite the Phi values in the specified block to use the mappings
1113 /// from the initial operand. Once the Phi is scheduled, we switch
1114 /// to using the loop value instead of the Phi value, so those names
1115 /// do not need to be rewritten.
1116 void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
1117                                               unsigned StageNum,
1118                                               ValueMapTy *VRMap,
1119                                               InstrMapTy &InstrMap) {
1120   for (auto &PHI : BB->phis()) {
1121     unsigned InitVal = 0;
1122     unsigned LoopVal = 0;
1123     getPhiRegs(PHI, BB, InitVal, LoopVal);
1124     Register PhiDef = PHI.getOperand(0).getReg();
1125 
1126     unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef));
1127     unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal));
1128     unsigned NumPhis = getStagesForPhi(PhiDef);
1129     if (NumPhis > StageNum)
1130       NumPhis = StageNum;
1131     for (unsigned np = 0; np <= NumPhis; ++np) {
1132       unsigned NewVal =
1133           getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
1134       if (!NewVal)
1135         NewVal = InitVal;
1136       rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef,
1137                             NewVal);
1138     }
1139   }
1140 }
1141 
1142 /// Rewrite a previously scheduled instruction to use the register value
1143 /// from the new instruction. Make sure the instruction occurs in the
1144 /// basic block, and we don't change the uses in the new instruction.
1145 void ModuloScheduleExpander::rewriteScheduledInstr(
1146     MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
1147     unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, unsigned NewReg,
1148     unsigned PrevReg) {
1149   bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
1150   int StagePhi = Schedule.getStage(Phi) + PhiNum;
1151   // Rewrite uses that have been scheduled already to use the new
1152   // Phi register.
1153   for (MachineOperand &UseOp :
1154        llvm::make_early_inc_range(MRI.use_operands(OldReg))) {
1155     MachineInstr *UseMI = UseOp.getParent();
1156     if (UseMI->getParent() != BB)
1157       continue;
1158     if (UseMI->isPHI()) {
1159       if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
1160         continue;
1161       if (getLoopPhiReg(*UseMI, BB) != OldReg)
1162         continue;
1163     }
1164     InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
1165     assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
1166     MachineInstr *OrigMI = OrigInstr->second;
1167     int StageSched = Schedule.getStage(OrigMI);
1168     int CycleSched = Schedule.getCycle(OrigMI);
1169     unsigned ReplaceReg = 0;
1170     // This is the stage for the scheduled instruction.
1171     if (StagePhi == StageSched && Phi->isPHI()) {
1172       int CyclePhi = Schedule.getCycle(Phi);
1173       if (PrevReg && InProlog)
1174         ReplaceReg = PrevReg;
1175       else if (PrevReg && !isLoopCarried(*Phi) &&
1176                (CyclePhi <= CycleSched || OrigMI->isPHI()))
1177         ReplaceReg = PrevReg;
1178       else
1179         ReplaceReg = NewReg;
1180     }
1181     // The scheduled instruction occurs before the scheduled Phi, and the
1182     // Phi is not loop carried.
1183     if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
1184       ReplaceReg = NewReg;
1185     if (StagePhi > StageSched && Phi->isPHI())
1186       ReplaceReg = NewReg;
1187     if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
1188       ReplaceReg = NewReg;
1189     if (ReplaceReg) {
1190       const TargetRegisterClass *NRC =
1191           MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
1192       if (NRC)
1193         UseOp.setReg(ReplaceReg);
1194       else {
1195         Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
1196         BuildMI(*BB, UseMI, UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY),
1197                 SplitReg)
1198             .addReg(ReplaceReg);
1199         UseOp.setReg(SplitReg);
1200       }
1201     }
1202   }
1203 }
1204 
1205 bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
1206   if (!Phi.isPHI())
1207     return false;
1208   int DefCycle = Schedule.getCycle(&Phi);
1209   int DefStage = Schedule.getStage(&Phi);
1210 
1211   unsigned InitVal = 0;
1212   unsigned LoopVal = 0;
1213   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
1214   MachineInstr *Use = MRI.getVRegDef(LoopVal);
1215   if (!Use || Use->isPHI())
1216     return true;
1217   int LoopCycle = Schedule.getCycle(Use);
1218   int LoopStage = Schedule.getStage(Use);
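  // Treat the Phi as loop carried when the loop operand is defined in a later
  // cycle than the Phi, or in the same or an earlier stage, meaning the value
  // the Phi reads comes from the previous iteration.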
1219   return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
1220 }
1221 
1222 //===----------------------------------------------------------------------===//
1223 // PeelingModuloScheduleExpander implementation
1224 //===----------------------------------------------------------------------===//
1225 // This is a reimplementation of ModuloScheduleExpander that works by creating
1226 // a fully correct steady-state kernel and peeling off the prolog and epilogs.
1227 //===----------------------------------------------------------------------===//
1228 
1229 namespace {
1230 // Remove any dead phis in MBB. Dead phis either have only one block as input
1231 // (in which case they are the identity) or have no uses.
1232 void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
1233                        LiveIntervals *LIS, bool KeepSingleSrcPhi = false) {
1234   bool Changed = true;
1235   while (Changed) {
1236     Changed = false;
1237     for (MachineInstr &MI : llvm::make_early_inc_range(MBB->phis())) {
1238       assert(MI.isPHI());
1239       if (MRI.use_empty(MI.getOperand(0).getReg())) {
1240         if (LIS)
1241           LIS->RemoveMachineInstrFromMaps(MI);
1242         MI.eraseFromParent();
1243         Changed = true;
1244       } else if (!KeepSingleSrcPhi && MI.getNumExplicitOperands() == 3) {
1245         const TargetRegisterClass *ConstrainRegClass =
1246             MRI.constrainRegClass(MI.getOperand(1).getReg(),
1247                                   MRI.getRegClass(MI.getOperand(0).getReg()));
1248         assert(ConstrainRegClass &&
1249                "Expected a valid constrained register class!");
1250         (void)ConstrainRegClass;
1251         MRI.replaceRegWith(MI.getOperand(0).getReg(),
1252                            MI.getOperand(1).getReg());
1253         if (LIS)
1254           LIS->RemoveMachineInstrFromMaps(MI);
1255         MI.eraseFromParent();
1256         Changed = true;
1257       }
1258     }
1259   }
1260 }
1261 
1262 /// Rewrites the kernel block in-place to adhere to the given schedule.
1263 /// KernelRewriter holds all of the state required to perform the rewriting.
1264 class KernelRewriter {
1265   ModuloSchedule &S;
1266   MachineBasicBlock *BB;
1267   MachineBasicBlock *PreheaderBB, *ExitBB;
1268   MachineRegisterInfo &MRI;
1269   const TargetInstrInfo *TII;
1270   LiveIntervals *LIS;
1271 
1272   // Map from register class to canonical undef register for that class.
1273   DenseMap<const TargetRegisterClass *, Register> Undefs;
1274   // Map from <LoopReg, InitReg> to phi register for all created phis. Note that
1275   // this map is only used when InitReg is non-undef.
1276   DenseMap<std::pair<unsigned, unsigned>, Register> Phis;
1277   // Map from LoopReg to phi register where the InitReg is undef.
1278   DenseMap<Register, Register> UndefPhis;
1279 
1280   // Reg is used by MI. Return the new register MI should use to adhere to the
1281   // schedule. Insert phis as necessary.
1282   Register remapUse(Register Reg, MachineInstr &MI);
1283   // Insert a phi that carries LoopReg from the loop body and InitReg otherwise.
1284   // If InitReg is not given it is chosen arbitrarily. It will either be undef
1285   // or will be chosen so as to share another phi.
1286   Register phi(Register LoopReg, std::optional<Register> InitReg = {},
1287                const TargetRegisterClass *RC = nullptr);
1288   // Create an undef register of the given register class.
1289   Register undef(const TargetRegisterClass *RC);
1290 
1291 public:
1292   KernelRewriter(MachineLoop &L, ModuloSchedule &S, MachineBasicBlock *LoopBB,
1293                  LiveIntervals *LIS = nullptr);
1294   void rewrite();
1295 };
1296 } // namespace
1297 
1298 KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
1299                                MachineBasicBlock *LoopBB, LiveIntervals *LIS)
1300     : S(S), BB(LoopBB), PreheaderBB(L.getLoopPreheader()),
1301       ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
1302       TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
1303   PreheaderBB = *BB->pred_begin();
1304   if (PreheaderBB == BB)
1305     PreheaderBB = *std::next(BB->pred_begin());
1306 }
1307 
1308 void KernelRewriter::rewrite() {
1309   // Rearrange the loop to be in schedule order. Note that the schedule may
1310   // contain instructions that are not owned by the loop block (InstrChanges and
1311   // friends), so we gracefully handle unowned instructions and delete any
1312   // instructions that weren't in the schedule.
1313   auto InsertPt = BB->getFirstTerminator();
1314   MachineInstr *FirstMI = nullptr;
1315   for (MachineInstr *MI : S.getInstructions()) {
1316     if (MI->isPHI())
1317       continue;
1318     if (MI->getParent())
1319       MI->removeFromParent();
1320     BB->insert(InsertPt, MI);
1321     if (!FirstMI)
1322       FirstMI = MI;
1323   }
1324   assert(FirstMI && "Failed to find first MI in schedule");
1325 
1326   // At this point all of the scheduled instructions are between FirstMI
1327   // and the end of the block. Kill from the first non-phi to FirstMI.
1328   for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
1329     if (LIS)
1330       LIS->RemoveMachineInstrFromMaps(*I);
1331     (I++)->eraseFromParent();
1332   }
1333 
1334   // Now remap every instruction in the loop.
1335   for (MachineInstr &MI : *BB) {
1336     if (MI.isPHI() || MI.isTerminator())
1337       continue;
1338     for (MachineOperand &MO : MI.uses()) {
1339       if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
1340         continue;
1341       Register Reg = remapUse(MO.getReg(), MI);
1342       MO.setReg(Reg);
1343     }
1344   }
1345   EliminateDeadPhis(BB, MRI, LIS);
1346 
1347   // Ensure a phi exists for all instructions that are either referenced by
1348   // an illegal phi or by an instruction outside the loop. This allows us to
1349   // treat remaps of these values the same as "normal" values that come from
1350   // loop-carried phis.
1351   for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
1352     if (MI->isPHI()) {
1353       Register R = MI->getOperand(0).getReg();
1354       phi(R);
1355       continue;
1356     }
1357 
1358     for (MachineOperand &Def : MI->defs()) {
1359       for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) {
1360         if (MI.getParent() != BB) {
1361           phi(Def.getReg());
1362           break;
1363         }
1364       }
1365     }
1366   }
1367 }
1368 
1369 Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
1370   MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
1371   if (!Producer)
1372     return Reg;
1373 
1374   int ConsumerStage = S.getStage(&MI);
1375   if (!Producer->isPHI()) {
1376     // Non-phi producers are simple to remap. Insert as many phis as the
1377     // difference between the consumer and producer stages.
1378     if (Producer->getParent() != BB)
1379       // Producer was not inside the loop. Use the register as-is.
1380       return Reg;
1381     int ProducerStage = S.getStage(Producer);
1382     assert(ConsumerStage != -1 &&
1383            "In-loop consumer should always be scheduled!");
1384     assert(ConsumerStage >= ProducerStage);
1385     unsigned StageDiff = ConsumerStage - ProducerStage;
1386 
1387     for (unsigned I = 0; I < StageDiff; ++I)
1388       Reg = phi(Reg);
1389     return Reg;
1390   }
1391 
1392   // First, dive through the phi chain to find the defaults for the generated
1393   // phis.
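  // Illustrative example (register names are made up): if Reg is %p1 with
  //   %p1 = PHI %init1, %preheader, %p2, %bb
  //   %p2 = PHI %init2, %preheader, %d, %bb
  // then the walk below collects Defaults = {%init1, %init2}, ends with
  // LoopReg = %d, and LoopProducer is %d's non-phi definition.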
1394   SmallVector<std::optional<Register>, 4> Defaults;
1395   Register LoopReg = Reg;
1396   auto LoopProducer = Producer;
1397   while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
1398     LoopReg = getLoopPhiReg(*LoopProducer, BB);
1399     Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB));
1400     LoopProducer = MRI.getUniqueVRegDef(LoopReg);
1401     assert(LoopProducer);
1402   }
1403   int LoopProducerStage = S.getStage(LoopProducer);
1404 
1405   std::optional<Register> IllegalPhiDefault;
1406 
1407   if (LoopProducerStage == -1) {
1408     // Do nothing.
1409   } else if (LoopProducerStage > ConsumerStage) {
1410     // This schedule is only representable if ProducerStage == ConsumerStage+1.
1411     // In addition, the Consumer's cycle must be scheduled at or after the
1412     // Producer's in the rescheduled loop. This is enforced by the pipeliner's
1413     // ASAP and ALAP functions.
1414 #ifndef NDEBUG // Silence unused variables in non-asserts mode.
1415     int LoopProducerCycle = S.getCycle(LoopProducer);
1416     int ConsumerCycle = S.getCycle(&MI);
1417 #endif
1418     assert(LoopProducerCycle <= ConsumerCycle);
1419     assert(LoopProducerStage == ConsumerStage + 1);
1420     // Peel off the first phi from Defaults and insert a phi between producer
1421     // and consumer. This phi will not be at the front of the block so we
1422     // consider it illegal. It will only exist during the rewrite process; it
1423     // needs to exist while we peel off prologs because these could take the
1424     // default value. After that we can replace all uses with the loop producer
1425     // value.
1426     IllegalPhiDefault = Defaults.front();
1427     Defaults.erase(Defaults.begin());
1428   } else {
1429     assert(ConsumerStage >= LoopProducerStage);
1430     int StageDiff = ConsumerStage - LoopProducerStage;
1431     if (StageDiff > 0) {
1432       LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
1433                         << " to " << (Defaults.size() + StageDiff) << "\n");
1434       // If we need more phis than we have defaults for, pad out with undefs for
1435       // the earliest phis, which are at the end of the defaults chain (the
1436       // chain is in reverse order).
1437       Defaults.resize(Defaults.size() + StageDiff,
1438                       Defaults.empty() ? std::optional<Register>()
1439                                        : Defaults.back());
1440     }
1441   }
1442 
1443   // Now we know the number of stages to jump back, insert the phi chain.
1444   auto DefaultI = Defaults.rbegin();
1445   while (DefaultI != Defaults.rend())
1446     LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg));
1447 
1448   if (IllegalPhiDefault) {
1449     // The consumer optionally consumes LoopProducer in the same iteration
1450     // (because the producer is scheduled at an earlier cycle than the consumer)
1451     // or the initial value. To facilitate this we create an illegal block here
1452     // by embedding a phi in the middle of the block. We will fix this up
1453     // immediately prior to pruning.
1454     auto RC = MRI.getRegClass(Reg);
1455     Register R = MRI.createVirtualRegister(RC);
1456     MachineInstr *IllegalPhi =
1457         BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
1458             .addReg(*IllegalPhiDefault)
1459             .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
1460             .addReg(LoopReg)
1461             .addMBB(BB); // Block choice is arbitrary and has no effect.
1462     // Illegal phi should belong to the producer stage so that it can be
1463     // filtered correctly during peeling.
1464     S.setStage(IllegalPhi, LoopProducerStage);
1465     return R;
1466   }
1467 
1468   return LoopReg;
1469 }
1470 
1471 Register KernelRewriter::phi(Register LoopReg, std::optional<Register> InitReg,
1472                              const TargetRegisterClass *RC) {
1473   // If the init register is not undef, try and find an existing phi.
1474   if (InitReg) {
1475     auto I = Phis.find({LoopReg, *InitReg});
1476     if (I != Phis.end())
1477       return I->second;
1478   } else {
1479     for (auto &KV : Phis) {
1480       if (KV.first.first == LoopReg)
1481         return KV.second;
1482     }
1483   }
1484 
1485   // InitReg is either undef or no existing phi takes InitReg as input. Try and
1486   // find a phi that takes undef as input.
1487   auto I = UndefPhis.find(LoopReg);
1488   if (I != UndefPhis.end()) {
1489     Register R = I->second;
1490     if (!InitReg)
1491       // Found a phi taking undef as input, and this input is undef so return
1492       // without any more changes.
1493       return R;
1494     // Found a phi taking undef as input, so rewrite it to take InitReg.
1495     MachineInstr *MI = MRI.getVRegDef(R);
1496     MI->getOperand(1).setReg(*InitReg);
1497     Phis.insert({{LoopReg, *InitReg}, R});
1498     const TargetRegisterClass *ConstrainRegClass =
1499         MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
1500     assert(ConstrainRegClass && "Expected a valid constrained register class!");
1501     (void)ConstrainRegClass;
1502     UndefPhis.erase(I);
1503     return R;
1504   }
1505 
1506   // Failed to find any existing phi to reuse, so create a new one.
1507   if (!RC)
1508     RC = MRI.getRegClass(LoopReg);
1509   Register R = MRI.createVirtualRegister(RC);
1510   if (InitReg) {
1511     const TargetRegisterClass *ConstrainRegClass =
1512         MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
1513     assert(ConstrainRegClass && "Expected a valid constrained register class!");
1514     (void)ConstrainRegClass;
1515   }
1516   BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R)
1517       .addReg(InitReg ? *InitReg : undef(RC))
1518       .addMBB(PreheaderBB)
1519       .addReg(LoopReg)
1520       .addMBB(BB);
1521   if (!InitReg)
1522     UndefPhis[LoopReg] = R;
1523   else
1524     Phis[{LoopReg, *InitReg}] = R;
1525   return R;
1526 }
1527 
1528 Register KernelRewriter::undef(const TargetRegisterClass *RC) {
1529   Register &R = Undefs[RC];
1530   if (R == 0) {
1531     // Create an IMPLICIT_DEF that defines this register if we need it.
1532     // All uses of this should be removed by the time we have finished unrolling
1533     // prologs and epilogs.
1534     R = MRI.createVirtualRegister(RC);
1535     auto *InsertBB = &PreheaderBB->getParent()->front();
1536     BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(),
1537             TII->get(TargetOpcode::IMPLICIT_DEF), R);
1538   }
1539   return R;
1540 }
1541 
1542 namespace {
1543 /// Describes an operand in the kernel of a pipelined loop. Characteristics of
1544 /// the operand are discovered, such as how many in-loop PHIs it has to jump
1545 /// through and defaults for these phis.
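/// For example, a use that reaches its in-loop definition through two kernel
/// PHIs has a distance of two, and PhiDefaults records the initial (preheader)
/// value of each PHI traversed along the way.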
1546 class KernelOperandInfo {
1547   MachineBasicBlock *BB;
1548   MachineRegisterInfo &MRI;
1549   SmallVector<Register, 4> PhiDefaults;
1550   MachineOperand *Source;
1551   MachineOperand *Target;
1552 
1553 public:
1554   KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
1555                     const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
1556       : MRI(MRI) {
1557     Source = MO;
1558     BB = MO->getParent()->getParent();
1559     while (isRegInLoop(MO)) {
1560       MachineInstr *MI = MRI.getVRegDef(MO->getReg());
1561       if (MI->isFullCopy()) {
1562         MO = &MI->getOperand(1);
1563         continue;
1564       }
1565       if (!MI->isPHI())
1566         break;
1567       // If this is an illegal phi, don't count it in distance.
1568       if (IllegalPhis.count(MI)) {
1569         MO = &MI->getOperand(3);
1570         continue;
1571       }
1572 
1573       Register Default = getInitPhiReg(*MI, BB);
1574       MO = MI->getOperand(2).getMBB() == BB ? &MI->getOperand(1)
1575                                             : &MI->getOperand(3);
1576       PhiDefaults.push_back(Default);
1577     }
1578     Target = MO;
1579   }
1580 
1581   bool operator==(const KernelOperandInfo &Other) const {
1582     return PhiDefaults.size() == Other.PhiDefaults.size();
1583   }
1584 
1585   void print(raw_ostream &OS) const {
1586     OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
1587        << *Source->getParent();
1588   }
1589 
1590 private:
1591   bool isRegInLoop(MachineOperand *MO) {
1592     return MO->isReg() && MO->getReg().isVirtual() &&
1593            MRI.getVRegDef(MO->getReg())->getParent() == BB;
1594   }
1595 };
1596 } // namespace
1597 
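/// Peel a single iteration off the front or back of the kernel, record the new
/// block in PeeledFront/PeeledBack, and remember for every peeled instruction
/// which kernel instruction it was cloned from (CanonicalMIs and BlockMIs).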
1598 MachineBasicBlock *
1599 PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
1600   MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII);
1601   if (LPD == LPD_Front)
1602     PeeledFront.push_back(NewBB);
1603   else
1604     PeeledBack.push_front(NewBB);
1605   for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
1606        ++I, ++NI) {
1607     CanonicalMIs[&*I] = &*I;
1608     CanonicalMIs[&*NI] = &*I;
1609     BlockMIs[{NewBB, &*I}] = &*NI;
1610     BlockMIs[{BB, &*I}] = &*I;
1611   }
1612   return NewBB;
1613 }
1614 
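/// Remove from MB every instruction whose stage is known and below MinStage.
/// PHI uses of the removed values are substituted with the block-local
/// equivalents of those PHIs' results so that no dangling operands remain.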
1615 void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
1616                                                        int MinStage) {
1617   for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
1618        I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
1619     MachineInstr *MI = &*I++;
1620     int Stage = getStage(MI);
1621     if (Stage == -1 || Stage >= MinStage)
1622       continue;
1623 
1624     for (MachineOperand &DefMO : MI->defs()) {
1625       SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1626       for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
1627         // Only PHIs can use values from this block by construction.
1628         // Match with the equivalent PHI in B.
1629         assert(UseMI.isPHI());
1630         Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
1631                                                MI->getParent());
1632         Subs.emplace_back(&UseMI, Reg);
1633       }
1634       for (auto &Sub : Subs)
1635         Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
1636                                       *MRI.getTargetRegisterInfo());
1637     }
1638     if (LIS)
1639       LIS->RemoveMachineInstrFromMaps(*MI);
1640     MI->eraseFromParent();
1641   }
1642 }
1643 
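/// Move all instructions of the given Stage from SourceBB into DestBB,
/// inserting or cloning PHIs as needed so that moved and remaining
/// instructions keep seeing the right values, and deleting PHIs that become
/// redundant once their source lives in the same block.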
1644 void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
1645     MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
1646   auto InsertPt = DestBB->getFirstNonPHI();
1647   DenseMap<Register, Register> Remaps;
1648   for (MachineInstr &MI : llvm::make_early_inc_range(
1649            llvm::make_range(SourceBB->getFirstNonPHI(), SourceBB->end()))) {
1650     if (MI.isPHI()) {
1651       // This is an illegal PHI. If we move any instructions using an illegal
1652       // PHI, we need to create a legal Phi.
1653       if (getStage(&MI) != Stage) {
1654         // The legal Phi is not necessary if the illegal phi's stage
1655         // is being moved.
1656         Register PhiR = MI.getOperand(0).getReg();
1657         auto RC = MRI.getRegClass(PhiR);
1658         Register NR = MRI.createVirtualRegister(RC);
1659         MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(),
1660                                    DebugLoc(), TII->get(TargetOpcode::PHI), NR)
1661                                .addReg(PhiR)
1662                                .addMBB(SourceBB);
1663         BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NI;
1664         CanonicalMIs[NI] = CanonicalMIs[&MI];
1665         Remaps[PhiR] = NR;
1666       }
1667     }
1668     if (getStage(&MI) != Stage)
1669       continue;
1670     MI.removeFromParent();
1671     DestBB->insert(InsertPt, &MI);
1672     auto *KernelMI = CanonicalMIs[&MI];
1673     BlockMIs[{DestBB, KernelMI}] = &MI;
1674     BlockMIs.erase({SourceBB, KernelMI});
1675   }
1676   SmallVector<MachineInstr *, 4> PhiToDelete;
1677   for (MachineInstr &MI : DestBB->phis()) {
1678     assert(MI.getNumOperands() == 3);
1679     MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
1680     // If the instruction referenced by the phi is moved inside the block
1681     // we don't need the phi anymore.
1682     if (getStage(Def) == Stage) {
1683       Register PhiReg = MI.getOperand(0).getReg();
1684       assert(Def->findRegisterDefOperandIdx(MI.getOperand(1).getReg(),
1685                                             /*TRI=*/nullptr) != -1);
1686       MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
1687       MI.getOperand(0).setReg(PhiReg);
1688       PhiToDelete.push_back(&MI);
1689     }
1690   }
1691   for (auto *P : PhiToDelete)
1692     P->eraseFromParent();
1693   InsertPt = DestBB->getFirstNonPHI();
1694   // Helper to clone Phi instructions into the destination block. We clone Phis
1695   // greedily to avoid combinatorial explosion of Phi instructions.
1696   auto clonePhi = [&](MachineInstr *Phi) {
1697     MachineInstr *NewMI = MF.CloneMachineInstr(Phi);
1698     DestBB->insert(InsertPt, NewMI);
1699     Register OrigR = Phi->getOperand(0).getReg();
1700     Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
1701     NewMI->getOperand(0).setReg(R);
1702     NewMI->getOperand(1).setReg(OrigR);
1703     NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
1704     Remaps[OrigR] = R;
1705     CanonicalMIs[NewMI] = CanonicalMIs[Phi];
1706     BlockMIs[{DestBB, CanonicalMIs[Phi]}] = NewMI;
1707     PhiNodeLoopIteration[NewMI] = PhiNodeLoopIteration[Phi];
1708     return R;
1709   };
1710   for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I) {
1711     for (MachineOperand &MO : I->uses()) {
1712       if (!MO.isReg())
1713         continue;
1714       if (auto It = Remaps.find(MO.getReg()); It != Remaps.end())
1715         MO.setReg(It->second);
1716       else {
1717         // If we are using a phi from the source block we need to add a new phi
1718         // pointing to the old one.
1719         MachineInstr *Use = MRI.getUniqueVRegDef(MO.getReg());
1720         if (Use && Use->isPHI() && Use->getParent() == SourceBB) {
1721           Register R = clonePhi(Use);
1722           MO.setReg(R);
1723         }
1724       }
1725     }
1726   }
1727 }
1728 
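/// Starting from CanonicalPhi, follow the loop-carried operand chain backwards
/// PhiNodeLoopIteration[Phi] times and return the register reached; this maps
/// a phi in a peeled block back to the kernel value of the matching iteration.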
1729 Register
1730 PeelingModuloScheduleExpander::getPhiCanonicalReg(MachineInstr *CanonicalPhi,
1731                                                   MachineInstr *Phi) {
1732   unsigned distance = PhiNodeLoopIteration[Phi];
1733   MachineInstr *CanonicalUse = CanonicalPhi;
1734   Register CanonicalUseReg = CanonicalUse->getOperand(0).getReg();
1735   for (unsigned I = 0; I < distance; ++I) {
1736     assert(CanonicalUse->isPHI());
1737     assert(CanonicalUse->getNumOperands() == 5);
1738     unsigned LoopRegIdx = 3, InitRegIdx = 1;
1739     if (CanonicalUse->getOperand(2).getMBB() == CanonicalUse->getParent())
1740       std::swap(LoopRegIdx, InitRegIdx);
1741     CanonicalUseReg = CanonicalUse->getOperand(LoopRegIdx).getReg();
1742     CanonicalUse = MRI.getVRegDef(CanonicalUseReg);
1743   }
1744   return CanonicalUseReg;
1745 }
1746 
1747 void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
1748   BitVector LS(Schedule.getNumStages(), true);
1749   BitVector AS(Schedule.getNumStages(), true);
1750   LiveStages[BB] = LS;
1751   AvailableStages[BB] = AS;
1752 
1753   // Peel out the prologs.
1754   LS.reset();
1755   for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
1756     LS[I] = true;
1757     Prologs.push_back(peelKernel(LPD_Front));
1758     LiveStages[Prologs.back()] = LS;
1759     AvailableStages[Prologs.back()] = LS;
1760   }
1761 
1762   // Create a block that will end up as the new loop exiting block (dominated by
1763   // all prologs and epilogs). It will only contain PHIs, in the same order as
1764   // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
1765   // that the exiting block is a (sub) clone of BB. This in turn gives us the
1766   // property that any value defined in BB but used outside of BB is used by a
1767   // PHI in the exiting block.
1768   MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
1769   EliminateDeadPhis(ExitingBB, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1770   // Push out the epilogs, again in reverse order.
1771   // We can't assume anything about the minimum loop trip count at this point,
1772   // so emit a fairly complex epilog.
1773 
1774   // We first peel off (number of stages - 1) epilogs. Then we remove dead
1775   // stages and reorder instructions based on their stage. If we have 3 stages
1776   // we generate first:
1777   // E0[3, 2, 1]
1778   // E1[3', 2']
1779   // E2[3'']
1780   // And then we move instructions based on their stages to have:
1781   // E0[3]
1782   // E1[2, 3']
1783   // E2[1, 2', 3'']
1784   // The transformation is legal because we only move instructions past
1785   // instructions of a previous loop iteration.
1786   for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
1787     Epilogs.push_back(peelKernel(LPD_Back));
1788     MachineBasicBlock *B = Epilogs.back();
1789     filterInstructions(B, Schedule.getNumStages() - I);
1790     // Keep track of which iteration each phi belongs to. We need it to know
1791     // what version of the variable to use during prologue/epilogue stitching.
1792     EliminateDeadPhis(B, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1793     for (MachineInstr &Phi : B->phis())
1794       PhiNodeLoopIteration[&Phi] = Schedule.getNumStages() - I;
1795   }
1796   for (size_t I = 0; I < Epilogs.size(); I++) {
1797     LS.reset();
1798     for (size_t J = I; J < Epilogs.size(); J++) {
1799       int Iteration = J;
1800       unsigned Stage = Schedule.getNumStages() - 1 + I - J;
1801       // Move stage one block at a time so that Phi nodes are updated correctly.
1802       for (size_t K = Iteration; K > I; K--)
1803         moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
1804       LS[Stage] = true;
1805     }
1806     LiveStages[Epilogs[I]] = LS;
1807     AvailableStages[Epilogs[I]] = AS;
1808   }
1809 
1810   // Now we've defined all the prolog and epilog blocks as a fallthrough
1811   // sequence, add the edges that will be followed if the loop trip count is
1812   // lower than the number of stages (connecting prologs directly with epilogs).
1813   auto PI = Prologs.begin();
1814   auto EI = Epilogs.begin();
1815   assert(Prologs.size() == Epilogs.size());
1816   for (; PI != Prologs.end(); ++PI, ++EI) {
1817     MachineBasicBlock *Pred = *(*EI)->pred_begin();
1818     (*PI)->addSuccessor(*EI);
1819     for (MachineInstr &MI : (*EI)->phis()) {
1820       Register Reg = MI.getOperand(1).getReg();
1821       MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
1822       if (Use && Use->getParent() == Pred) {
1823         MachineInstr *CanonicalUse = CanonicalMIs[Use];
1824         if (CanonicalUse->isPHI()) {
1825           // If the use comes from a phi we need to skip as many phis as the
1826           // distance between the epilogue and the kernel. Trace through the phi
1827           // chain to find the right value.
1828           Reg = getPhiCanonicalReg(CanonicalUse, Use);
1829         }
1830         Reg = getEquivalentRegisterIn(Reg, *PI);
1831       }
1832       MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
1833       MI.addOperand(MachineOperand::CreateMBB(*PI));
1834     }
1835   }
1836 
1837   // Create a list of all blocks in order.
1838   SmallVector<MachineBasicBlock *, 8> Blocks;
1839   llvm::copy(PeeledFront, std::back_inserter(Blocks));
1840   Blocks.push_back(BB);
1841   llvm::copy(PeeledBack, std::back_inserter(Blocks));
1842 
1843   // Iterate in reverse order over all instructions, remapping as we go.
1844   for (MachineBasicBlock *B : reverse(Blocks)) {
1845     for (auto I = B->instr_rbegin();
1846          I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
1847       MachineBasicBlock::reverse_instr_iterator MI = I++;
1848       rewriteUsesOf(&*MI);
1849     }
1850   }
1851   for (auto *MI : IllegalPhisToDelete) {
1852     if (LIS)
1853       LIS->RemoveMachineInstrFromMaps(*MI);
1854     MI->eraseFromParent();
1855   }
1856   IllegalPhisToDelete.clear();
1857 
1858   // Now all remapping has been done, we're free to optimize the generated code.
1859   for (MachineBasicBlock *B : reverse(Blocks))
1860     EliminateDeadPhis(B, MRI, LIS);
1861   EliminateDeadPhis(ExitingBB, MRI, LIS);
1862 }
1863 
1864 MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
1865   MachineFunction &MF = *BB->getParent();
1866   MachineBasicBlock *Exit = *BB->succ_begin();
1867   if (Exit == BB)
1868     Exit = *std::next(BB->succ_begin());
1869 
1870   MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
1871   MF.insert(std::next(BB->getIterator()), NewBB);
1872 
1873   // Clone all phis in BB into NewBB and rewrite.
1874   for (MachineInstr &MI : BB->phis()) {
1875     auto RC = MRI.getRegClass(MI.getOperand(0).getReg());
1876     Register OldR = MI.getOperand(3).getReg();
1877     Register R = MRI.createVirtualRegister(RC);
1878     SmallVector<MachineInstr *, 4> Uses;
1879     for (MachineInstr &Use : MRI.use_instructions(OldR))
1880       if (Use.getParent() != BB)
1881         Uses.push_back(&Use);
1882     for (MachineInstr *Use : Uses)
1883       Use->substituteRegister(OldR, R, /*SubIdx=*/0,
1884                               *MRI.getTargetRegisterInfo());
1885     MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R)
1886         .addReg(OldR)
1887         .addMBB(BB);
1888     BlockMIs[{NewBB, &MI}] = NI;
1889     CanonicalMIs[NI] = &MI;
1890   }
1891   BB->replaceSuccessor(Exit, NewBB);
1892   Exit->replacePhiUsesWith(BB, NewBB);
1893   NewBB->addSuccessor(Exit);
1894 
1895   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
1896   SmallVector<MachineOperand, 4> Cond;
1897   bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond);
1898   (void)CanAnalyzeBr;
1899   assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
1900   TII->removeBranch(*BB);
1901   TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB,
1902                     Cond, DebugLoc());
1903   TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc());
1904   return NewBB;
1905 }
1906 
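/// Return the register that plays the same role in BB as Reg does in its own
/// block: look up Reg's defining instruction, find that instruction's
/// counterpart in BB via BlockMIs, and return the corresponding def operand.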
1907 Register
1908 PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
1909                                                        MachineBasicBlock *BB) {
1910   MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
1911   unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr);
1912   return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg();
1913 }
1914 
1915 void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
1916   if (MI->isPHI()) {
1917     // This is an illegal PHI. The loop-carried (desired) value is operand 3,
1918     // and it is produced by this block.
1919     Register PhiR = MI->getOperand(0).getReg();
1920     Register R = MI->getOperand(3).getReg();
1921     int RMIStage = getStage(MRI.getUniqueVRegDef(R));
1922     if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
1923       R = MI->getOperand(1).getReg();
1924     MRI.setRegClass(R, MRI.getRegClass(PhiR));
1925     MRI.replaceRegWith(PhiR, R);
1926     // Postpone deleting the Phi as it may be referenced by BlockMIs and used
1927     // later to figure out how to remap registers.
1928     MI->getOperand(0).setReg(PhiR);
1929     IllegalPhisToDelete.push_back(MI);
1930     return;
1931   }
1932 
1933   int Stage = getStage(MI);
1934   if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
1935       LiveStages[MI->getParent()].test(Stage))
1936     // Instruction is live, no rewriting to do.
1937     return;
1938 
1939   for (MachineOperand &DefMO : MI->defs()) {
1940     SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1941     for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
1942       // Only PHIs can use values from this block by construction.
1943       // Match with the equivalent PHI in B.
1944       assert(UseMI.isPHI());
1945       Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
1946                                              MI->getParent());
1947       Subs.emplace_back(&UseMI, Reg);
1948     }
1949     for (auto &Sub : Subs)
1950       Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
1951                                     *MRI.getTargetRegisterInfo());
1952   }
1953   if (LIS)
1954     LIS->RemoveMachineInstrFromMaps(*MI);
1955   MI->eraseFromParent();
1956 }
1957 
1958 void PeelingModuloScheduleExpander::fixupBranches() {
1959   // Work outwards from the kernel.
1960   bool KernelDisposed = false;
1961   int TC = Schedule.getNumStages() - 1;
1962   for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
1963        ++PI, ++EI, --TC) {
1964     MachineBasicBlock *Prolog = *PI;
1965     MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
1966     MachineBasicBlock *Epilog = *EI;
1967     SmallVector<MachineOperand, 4> Cond;
1968     TII->removeBranch(*Prolog);
1969     std::optional<bool> StaticallyGreater =
1970         LoopInfo->createTripCountGreaterCondition(TC, *Prolog, Cond);
1971     if (!StaticallyGreater) {
1972       LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
1973       // Dynamically branch based on Cond.
1974       TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
1975     } else if (*StaticallyGreater == false) {
1976       LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
1977       // Prolog never falls through; branch to epilog and orphan interior
1978       // blocks. Leave it to unreachable-block-elim to clean up.
1979       Prolog->removeSuccessor(Fallthrough);
1980       for (MachineInstr &P : Fallthrough->phis()) {
1981         P.removeOperand(2);
1982         P.removeOperand(1);
1983       }
1984       TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
1985       KernelDisposed = true;
1986     } else {
1987       LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
1988       // Prolog always falls through; remove incoming values in epilog.
1989       Prolog->removeSuccessor(Epilog);
1990       for (MachineInstr &P : Epilog->phis()) {
1991         P.removeOperand(4);
1992         P.removeOperand(3);
1993       }
1994     }
1995   }
1996 
1997   if (!KernelDisposed) {
1998     LoopInfo->adjustTripCount(-(Schedule.getNumStages() - 1));
1999     LoopInfo->setPreheader(Prologs.back());
2000   } else {
2001     LoopInfo->disposed();
2002   }
2003 }
2004 
2005 void PeelingModuloScheduleExpander::rewriteKernel() {
2006   KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2007   KR.rewrite();
2008 }
2009 
2010 void PeelingModuloScheduleExpander::expand() {
2011   BB = Schedule.getLoop()->getTopBlock();
2012   Preheader = Schedule.getLoop()->getLoopPreheader();
2013   LLVM_DEBUG(Schedule.dump());
2014   LoopInfo = TII->analyzeLoopForPipelining(BB);
2015   assert(LoopInfo);
2016 
2017   rewriteKernel();
2018   peelPrologAndEpilogs();
2019   fixupBranches();
2020 }
2021 
2022 void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
2023   BB = Schedule.getLoop()->getTopBlock();
2024   Preheader = Schedule.getLoop()->getLoopPreheader();
2025 
2026   // Dump the schedule before we invalidate and remap all its instructions.
2027   // Stash it in a string so we can print it if we found an error.
2028   std::string ScheduleDump;
2029   raw_string_ostream OS(ScheduleDump);
2030   Schedule.print(OS);
2031   OS.flush();
2032 
2033   // First, run the normal ModuloScheduleExpander. We don't support any
2034   // InstrChanges.
2035   assert(LIS && "Requires LiveIntervals!");
2036   ModuloScheduleExpander MSE(MF, Schedule, *LIS,
2037                              ModuloScheduleExpander::InstrChangesTy());
2038   MSE.expand();
2039   MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
2040   if (!ExpandedKernel) {
2041     // The expander optimized away the kernel. We can't do any useful checking.
2042     MSE.cleanup();
2043     return;
2044   }
2045   // Before running the KernelRewriter, re-add BB into the CFG.
2046   Preheader->addSuccessor(BB);
2047 
2048   // Now run the new expansion algorithm.
2049   KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2050   KR.rewrite();
2051   peelPrologAndEpilogs();
2052 
2053   // Collect all illegal phis that the new algorithm created. We'll give these
2054   // to KernelOperandInfo.
2055   SmallPtrSet<MachineInstr *, 4> IllegalPhis;
2056   for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
2057     if (NI->isPHI())
2058       IllegalPhis.insert(&*NI);
2059   }
2060 
2061   // Co-iterate across both kernels. We expect them to be identical apart from
2062   // phis and full COPYs (we look through both).
2063   SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
2064   auto OI = ExpandedKernel->begin();
2065   auto NI = BB->begin();
2066   for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
2067     while (OI->isPHI() || OI->isFullCopy())
2068       ++OI;
2069     while (NI->isPHI() || NI->isFullCopy())
2070       ++NI;
2071     assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
2072     // Analyze every operand separately.
2073     for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
2074          OOpI != OI->operands_end(); ++OOpI, ++NOpI)
2075       KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
2076                         KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
2077   }
2078 
2079   bool Failed = false;
2080   for (auto &OldAndNew : KOIs) {
2081     if (OldAndNew.first == OldAndNew.second)
2082       continue;
2083     Failed = true;
2084     errs() << "Modulo kernel validation error: [\n";
2085     errs() << " [golden] ";
2086     OldAndNew.first.print(errs());
2087     errs() << "          ";
2088     OldAndNew.second.print(errs());
2089     errs() << "]\n";
2090   }
2091 
2092   if (Failed) {
2093     errs() << "Golden reference kernel:\n";
2094     ExpandedKernel->print(errs());
2095     errs() << "New kernel:\n";
2096     BB->print(errs());
2097     errs() << ScheduleDump;
2098     report_fatal_error(
2099         "Modulo kernel validation (-pipeliner-experimental-cg) failed");
2100   }
2101 
2102   // Cleanup by removing BB from the CFG again as the original
2103   // ModuloScheduleExpander intended.
2104   Preheader->removeSuccessor(BB);
2105   MSE.cleanup();
2106 }
2107 
2108 MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) {
2109   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
2110 
2111   // TODO: Offset information needs to be corrected.
2112   NewMI->dropMemRefs(MF);
2113 
2114   return NewMI;
2115 }
2116 
2117 /// Create a dedicated exit for Loop. Exit is the original exit for Loop.
2118 /// If it is already a dedicated exit, return it. Otherwise, insert a new
2119 /// block between them and return the new block.
2120 static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
2121                                               MachineBasicBlock *Exit) {
2122   if (Exit->pred_size() == 1)
2123     return Exit;
2124 
2125   MachineFunction *MF = Loop->getParent();
2126   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
2127 
2128   MachineBasicBlock *NewExit =
2129       MF->CreateMachineBasicBlock(Loop->getBasicBlock());
2130   MF->insert(Loop->getIterator(), NewExit);
2131 
2132   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2133   SmallVector<MachineOperand, 4> Cond;
2134   TII->analyzeBranch(*Loop, TBB, FBB, Cond);
2135   if (TBB == Loop)
2136     FBB = NewExit;
2137   else if (FBB == Loop)
2138     TBB = NewExit;
2139   else
2140     llvm_unreachable("unexpected loop structure");
2141   TII->removeBranch(*Loop);
2142   TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc());
2143   Loop->replaceSuccessor(Exit, NewExit);
2144   TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc());
2145   NewExit->addSuccessor(Exit);
2146 
2147   Exit->replacePhiUsesWith(Loop, NewExit);
2148 
2149   return NewExit;
2150 }
2151 
2152 /// Insert branch code into the end of MBB. It branches to GreaterThan if the
2153 /// remaining trip count for instructions in LastStage0Insts is greater than
2154 /// RequiredTC, and to Otherwise otherwise.
2155 void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB,
2156                                                  int RequiredTC,
2157                                                  InstrMapTy &LastStage0Insts,
2158                                                  MachineBasicBlock &GreaterThan,
2159                                                  MachineBasicBlock &Otherwise) {
2160   SmallVector<MachineOperand, 4> Cond;
2161   LoopInfo->createRemainingIterationsGreaterCondition(RequiredTC, MBB, Cond,
2162                                                       LastStage0Insts);
2163 
2164   if (SwapBranchTargetsMVE) {
2165     // Set SwapBranchTargetsMVE to true if a target prefers to replace TBB and
2166     // FBB for optimal performance.
2167     if (TII->reverseBranchCondition(Cond))
2168       llvm_unreachable("can not reverse branch condition");
2169     TII->insertBranch(MBB, &Otherwise, &GreaterThan, Cond, DebugLoc());
2170   } else {
2171     TII->insertBranch(MBB, &GreaterThan, &Otherwise, Cond, DebugLoc());
2172   }
2173 }
2174 
2175 /// Generate a pipelined loop that is unrolled by using MVE algorithm and any
2176 /// other necessary blocks. The control flow is modified to execute the
2177 /// pipelined loop if the trip count satisfies the condition, otherwise the
2178 /// original loop. The original loop is also used to execute the remainder
2179 /// iterations which occur due to unrolling.
2180 void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
2181   // The control flow for pipelining with MVE:
2182   //
2183   // OrigPreheader:
2184   //   // The block that is originally the loop preheader
2185   //   goto Check
2186   //
2187   // Check:
2188   //   // Check whether the trip count satisfies the requirements to pipeline.
2189   //   if (LoopCounter > NumStages + NumUnroll - 2)
2190   //     // The minimum number of iterations to pipeline =
2191   //     //   iterations executed in prolog/epilog (NumStages-1) +
2192   //     //   iterations executed in one kernel run (NumUnroll)
2193   //     goto Prolog
2194   //   // fallback to the original loop
2195   //   goto NewPreheader
2196   //
2197   // Prolog:
2198   //   // All prolog stages. There are no direct branches to the epilogue.
2199   //   goto NewKernel
2200   //
2201   // NewKernel:
2202   //   // NumUnroll copies of the kernel
2203   //   if (LoopCounter > MVE-1)
2204   //     goto NewKernel
2205   //   goto Epilog
2206   //
2207   // Epilog:
2208   //   // All epilog stages.
2209   //   if (LoopCounter > 0)
2210   //     // The remainder is executed in the original loop
2211   //     goto NewPreheader
2212   //   goto NewExit
2213   //
2214   // NewPreheader:
2215   //   // Newly created preheader for the original loop.
2216   //   // The initial values of the phis in the loop are merged from two paths.
2217   //   NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
2218   //   goto OrigKernel
2219   //
2220   // OrigKernel:
2221   //   // The original loop block.
2222   //   if (LoopCounter != 0)
2223   //     goto OrigKernel
2224   //   goto NewExit
2225   //
2226   // NewExit:
2227   //   // Newly created dedicated exit for the original loop.
2228   //   // Merge values which are referenced after the loop
2229   //   Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
2230   //   goto OrigExit
2231   //
2232   // OrigExit:
2233   //   // The block that is originally the loop exit.
2234   //   // If it is already a dedicated exit, NewExit is not created.
2235 
2236   // An example of where each stage is executed:
2237   // Assume #Stages 3, #MVE 4, #Iterations 12
2238   // Iter   0 1 2 3 4 5 6 7 8 9 10-11
2239   // -------------------------------------------------
2240   // Stage  0                          Prolog#0
2241   // Stage  1 0                        Prolog#1
2242   // Stage  2 1 0                      Kernel Unroll#0 Iter#0
2243   // Stage    2 1 0                    Kernel Unroll#1 Iter#0
2244   // Stage      2 1 0                  Kernel Unroll#2 Iter#0
2245   // Stage        2 1 0                Kernel Unroll#3 Iter#0
2246   // Stage          2 1 0              Kernel Unroll#0 Iter#1
2247   // Stage            2 1 0            Kernel Unroll#1 Iter#1
2248   // Stage              2 1 0          Kernel Unroll#2 Iter#1
2249   // Stage                2 1 0        Kernel Unroll#3 Iter#1
2250   // Stage                  2 1        Epilog#0
2251   // Stage                    2        Epilog#1
2252   // Stage                      0-2    OrigKernel
2253 
2254   LoopInfo = TII->analyzeLoopForPipelining(OrigKernel);
2255   assert(LoopInfo && "Must be able to analyze loop!");
2256 
2257   calcNumUnroll();
2258 
2259   Check = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2260   Prolog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2261   NewKernel = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2262   Epilog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2263   NewPreheader = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2264 
2265   MF.insert(OrigKernel->getIterator(), Check);
2266   MF.insert(OrigKernel->getIterator(), Prolog);
2267   MF.insert(OrigKernel->getIterator(), NewKernel);
2268   MF.insert(OrigKernel->getIterator(), Epilog);
2269   MF.insert(OrigKernel->getIterator(), NewPreheader);
2270 
2271   NewExit = createDedicatedExit(OrigKernel, OrigExit);
2272 
2273   NewPreheader->transferSuccessorsAndUpdatePHIs(OrigPreheader);
2274   TII->insertUnconditionalBranch(*NewPreheader, OrigKernel, DebugLoc());
2275 
2276   OrigPreheader->addSuccessor(Check);
2277   TII->removeBranch(*OrigPreheader);
2278   TII->insertUnconditionalBranch(*OrigPreheader, Check, DebugLoc());
2279 
2280   Check->addSuccessor(Prolog);
2281   Check->addSuccessor(NewPreheader);
2282 
2283   Prolog->addSuccessor(NewKernel);
2284 
2285   NewKernel->addSuccessor(NewKernel);
2286   NewKernel->addSuccessor(Epilog);
2287 
2288   Epilog->addSuccessor(NewPreheader);
2289   Epilog->addSuccessor(NewExit);
2290 
2291   InstrMapTy LastStage0Insts;
2292   insertCondBranch(*Check, Schedule.getNumStages() + NumUnroll - 2,
2293                    LastStage0Insts, *Prolog, *NewPreheader);
2294 
2295   // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
2296   // register#
2297   SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
2298   generateProlog(PrologVRMap);
2299   generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
2300   generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
2301 }
2302 
2303 /// Replace MI's use operands according to the maps.
2304 void ModuloScheduleExpanderMVE::updateInstrUse(
2305     MachineInstr *MI, int StageNum, int PhaseNum,
2306     SmallVectorImpl<ValueMapTy> &CurVRMap,
2307     SmallVectorImpl<ValueMapTy> *PrevVRMap) {
2308   // If MI is in the prolog/kernel/epilog block, CurVRMap is
2309   // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
2310   // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
2311   // Refer to the appropriate map according to the stage difference between
2312   // MI and the definition of an operand.
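  // For example, a stage-2 use that reaches a stage-0 definition through one
  // loop Phi gets DiffStage = 1 + (2 - 0) = 3, so its value is taken from the
  // map entry three phases back, or from PrevVRMap when fewer phases exist.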
2313 
2314   for (MachineOperand &UseMO : MI->uses()) {
2315     if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
2316       continue;
2317     int DiffStage = 0;
2318     Register OrigReg = UseMO.getReg();
2319     MachineInstr *DefInst = MRI.getVRegDef(OrigReg);
2320     if (!DefInst || DefInst->getParent() != OrigKernel)
2321       continue;
2322     unsigned InitReg = 0;
2323     unsigned DefReg = OrigReg;
2324     if (DefInst->isPHI()) {
2325       ++DiffStage;
2326       unsigned LoopReg;
2327       getPhiRegs(*DefInst, OrigKernel, InitReg, LoopReg);
2328       // LoopReg is guaranteed to be defined within the loop by canApply()
2329       DefReg = LoopReg;
2330       DefInst = MRI.getVRegDef(LoopReg);
2331     }
2332     unsigned DefStageNum = Schedule.getStage(DefInst);
2333     DiffStage += StageNum - DefStageNum;
2334     Register NewReg;
2335     if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(DefReg))
2336       // NewReg is defined in a previous phase of the same block
2337       NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
2338     else if (!PrevVRMap)
2339       // Since this is the first iteration, refer to the initial register of the
2340       // loop
2341       NewReg = InitReg;
2342     else
2343       // Cases where DiffStage is larger than PhaseNum.
2344       // If MI is in the kernel block, the value is defined by the previous
2345       // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
2346       // value is defined in the kernel block and KernelVRMap is referenced.
2347       NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg];
2348 
2349     const TargetRegisterClass *NRC =
2350         MRI.constrainRegClass(NewReg, MRI.getRegClass(OrigReg));
2351     if (NRC)
2352       UseMO.setReg(NewReg);
2353     else {
2354       Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2355       BuildMI(*OrigKernel, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
2356               SplitReg)
2357           .addReg(NewReg);
2358       UseMO.setReg(SplitReg);
2359     }
2360   }
2361 }
2362 
2363 /// Return the phi, if any, that uses Reg as its loop-carried value.
2364 /// canApply() guarantees that at most one such phi exists.
2365 static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
2366   for (MachineInstr &Phi : Loop->phis()) {
2367     unsigned InitVal, LoopVal;
2368     getPhiRegs(Phi, Loop, InitVal, LoopVal);
2369     if (LoopVal == Reg)
2370       return &Phi;
2371   }
2372   return nullptr;
2373 }
2374 
2375 /// Generate phis for registers defined by OrigMI.
2376 void ModuloScheduleExpanderMVE::generatePhi(
2377     MachineInstr *OrigMI, int UnrollNum,
2378     SmallVectorImpl<ValueMapTy> &PrologVRMap,
2379     SmallVectorImpl<ValueMapTy> &KernelVRMap,
2380     SmallVectorImpl<ValueMapTy> &PhiVRMap) {
2381   int StageNum = Schedule.getStage(OrigMI);
2382   bool UsePrologReg;
2383   if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
2384     UsePrologReg = true;
2385   else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
2386     UsePrologReg = false;
2387   else
2388     return;
2389 
2390   // Examples that show which stages are merged by phi.
2391   // Meaning of the symbol following the stage number:
2392   //   a/b: Stages with the same letter are merged (UsePrologReg == true)
2393   //   +: Merged with the initial value (UsePrologReg == false)
2394   //   *: No phis required
2395   //
2396   // #Stages 3, #MVE 4
2397   // Iter   0 1 2 3 4 5 6 7 8
2398   // -----------------------------------------
2399   // Stage  0a                 Prolog#0
2400   // Stage  1a 0b              Prolog#1
2401   // Stage  2* 1* 0*           Kernel Unroll#0
2402   // Stage     2* 1* 0+        Kernel Unroll#1
2403   // Stage        2* 1+ 0a     Kernel Unroll#2
2404   // Stage           2+ 1a 0b  Kernel Unroll#3
2405   //
2406   // #Stages 3, #MVE 2
2407   // Iter   0 1 2 3 4 5 6 7 8
2408   // -----------------------------------------
2409   // Stage  0a                 Prolog#0
2410   // Stage  1a 0b              Prolog#1
2411   // Stage  2* 1+ 0a           Kernel Unroll#0
2412   // Stage     2+ 1a 0b        Kernel Unroll#1
2413   //
2414   // #Stages 3, #MVE 1
2415   // Iter   0 1 2 3 4 5 6 7 8
2416   // -----------------------------------------
2417   // Stage  0*                 Prolog#0
2418   // Stage  1a 0b              Prolog#1
2419   // Stage  2+ 1a 0b           Kernel Unroll#0
2420 
2421   for (MachineOperand &DefMO : OrigMI->defs()) {
2422     if (!DefMO.isReg() || DefMO.isDead())
2423       continue;
2424     Register OrigReg = DefMO.getReg();
2425     auto NewReg = KernelVRMap[UnrollNum].find(OrigReg);
2426     if (NewReg == KernelVRMap[UnrollNum].end())
2427       continue;
2428     Register CorrespondReg;
2429     if (UsePrologReg) {
2430       int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
2431       CorrespondReg = PrologVRMap[PrologNum][OrigReg];
2432     } else {
2433       MachineInstr *Phi = getLoopPhiUser(OrigReg, OrigKernel);
2434       if (!Phi)
2435         continue;
2436       CorrespondReg = getInitPhiReg(*Phi, OrigKernel);
2437     }
2438 
2439     assert(CorrespondReg.isValid());
2440     Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2441     BuildMI(*NewKernel, NewKernel->getFirstNonPHI(), DebugLoc(),
2442             TII->get(TargetOpcode::PHI), PhiReg)
2443         .addReg(NewReg->second)
2444         .addMBB(NewKernel)
2445         .addReg(CorrespondReg)
2446         .addMBB(Prolog);
2447     PhiVRMap[UnrollNum][OrigReg] = PhiReg;
2448   }
2449 }
2450 
2451 static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
2452                           MachineBasicBlock *NewMBB) {
2453   for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) {
2454     if (Phi.getOperand(Idx).getReg() == OrigReg) {
2455       Phi.getOperand(Idx).setReg(NewReg);
2456       Phi.getOperand(Idx + 1).setMBB(NewMBB);
2457       return;
2458     }
2459   }
2460 }
2461 
2462 /// Generate phis that merge values from multiple routes
2463 void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
2464                                                           Register NewReg) {
2465   SmallVector<MachineOperand *> UsesAfterLoop;
2466   SmallVector<MachineInstr *> LoopPhis;
2467   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(OrigReg),
2468                                          E = MRI.use_end();
2469        I != E; ++I) {
2470     MachineOperand &O = *I;
2471     if (O.getParent()->getParent() != OrigKernel &&
2472         O.getParent()->getParent() != Prolog &&
2473         O.getParent()->getParent() != NewKernel &&
2474         O.getParent()->getParent() != Epilog)
2475       UsesAfterLoop.push_back(&O);
2476     if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
2477       LoopPhis.push_back(O.getParent());
2478   }
2479 
2480   // Merge the route that only executes the pipelined loop (when there are no
2481   // remaining iterations) with the route that executes the original loop.
2482   if (!UsesAfterLoop.empty()) {
2483     Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2484     BuildMI(*NewExit, NewExit->getFirstNonPHI(), DebugLoc(),
2485             TII->get(TargetOpcode::PHI), PhiReg)
2486         .addReg(OrigReg)
2487         .addMBB(OrigKernel)
2488         .addReg(NewReg)
2489         .addMBB(Epilog);
2490 
2491     for (MachineOperand *MO : UsesAfterLoop)
2492       MO->setReg(PhiReg);
2493 
2494     if (!LIS.hasInterval(PhiReg))
2495       LIS.createEmptyInterval(PhiReg);
2496   }
2497 
2498   // Merge routes from the pipelined loop and the bypassed route before the
2499   // original loop
2500   if (!LoopPhis.empty()) {
2501     for (MachineInstr *Phi : LoopPhis) {
2502       unsigned InitReg, LoopReg;
2503       getPhiRegs(*Phi, OrigKernel, InitReg, LoopReg);
2504       Register NewInit = MRI.createVirtualRegister(MRI.getRegClass(InitReg));
2505       BuildMI(*NewPreheader, NewPreheader->getFirstNonPHI(), Phi->getDebugLoc(),
2506               TII->get(TargetOpcode::PHI), NewInit)
2507           .addReg(InitReg)
2508           .addMBB(Check)
2509           .addReg(NewReg)
2510           .addMBB(Epilog);
2511       replacePhiSrc(*Phi, InitReg, NewInit, NewPreheader);
2512     }
2513   }
2514 }
2515 
2516 void ModuloScheduleExpanderMVE::generateProlog(
2517     SmallVectorImpl<ValueMapTy> &PrologVRMap) {
2518   PrologVRMap.clear();
2519   PrologVRMap.resize(Schedule.getNumStages() - 1);
2520   DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2521   for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1;
2522        ++PrologNum) {
2523     for (MachineInstr *MI : Schedule.getInstructions()) {
2524       if (MI->isPHI())
2525         continue;
2526       int StageNum = Schedule.getStage(MI);
2527       if (StageNum > PrologNum)
2528         continue;
2529       MachineInstr *NewMI = cloneInstr(MI);
2530       updateInstrDef(NewMI, PrologVRMap[PrologNum], false);
2531       NewMIMap[NewMI] = {PrologNum, StageNum};
2532       Prolog->push_back(NewMI);
2533     }
2534   }
2535 
2536   for (auto I : NewMIMap) {
2537     MachineInstr *MI = I.first;
2538     int PrologNum = I.second.first;
2539     int StageNum = I.second.second;
2540     updateInstrUse(MI, StageNum, PrologNum, PrologVRMap, nullptr);
2541   }
2542 
2543   LLVM_DEBUG({
2544     dbgs() << "prolog:\n";
2545     Prolog->dump();
2546   });
2547 }
2548 
2549 void ModuloScheduleExpanderMVE::generateKernel(
2550     SmallVectorImpl<ValueMapTy> &PrologVRMap,
2551     SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) {
2552   KernelVRMap.clear();
2553   KernelVRMap.resize(NumUnroll);
2554   SmallVector<ValueMapTy> PhiVRMap;
2555   PhiVRMap.resize(NumUnroll);
2556   DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2557   DenseMap<MachineInstr *, MachineInstr *> MIMapLastStage0;
2558   for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
2559     for (MachineInstr *MI : Schedule.getInstructions()) {
2560       if (MI->isPHI())
2561         continue;
2562       int StageNum = Schedule.getStage(MI);
2563       MachineInstr *NewMI = cloneInstr(MI);
2564       if (UnrollNum == NumUnroll - 1)
2565         LastStage0Insts[MI] = NewMI;
2566       updateInstrDef(NewMI, KernelVRMap[UnrollNum],
2567                      (UnrollNum == NumUnroll - 1 && StageNum == 0));
2568       generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
2569       NewMIMap[NewMI] = {UnrollNum, StageNum};
2570       NewKernel->push_back(NewMI);
2571     }
2572   }
2573 
2574   for (auto I : NewMIMap) {
2575     MachineInstr *MI = I.first;
2576     int UnrollNum = I.second.first;
2577     int StageNum = I.second.second;
2578     updateInstrUse(MI, StageNum, UnrollNum, KernelVRMap, &PhiVRMap);
2579   }
2580 
2581   // If the remaining trip count is greater than NumUnroll-1, the loop continues.
2582   insertCondBranch(*NewKernel, NumUnroll - 1, LastStage0Insts, *NewKernel,
2583                    *Epilog);
2584 
2585   LLVM_DEBUG({
2586     dbgs() << "kernel:\n";
2587     NewKernel->dump();
2588   });
2589 }
2590 
2591 void ModuloScheduleExpanderMVE::generateEpilog(
2592     SmallVectorImpl<ValueMapTy> &KernelVRMap,
2593     SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) {
2594   EpilogVRMap.clear();
2595   EpilogVRMap.resize(Schedule.getNumStages() - 1);
2596   DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2597   for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1;
2598        ++EpilogNum) {
2599     for (MachineInstr *MI : Schedule.getInstructions()) {
2600       if (MI->isPHI())
2601         continue;
2602       int StageNum = Schedule.getStage(MI);
2603       if (StageNum <= EpilogNum)
2604         continue;
2605       MachineInstr *NewMI = cloneInstr(MI);
2606       updateInstrDef(NewMI, EpilogVRMap[EpilogNum], StageNum - 1 == EpilogNum);
2607       NewMIMap[NewMI] = {EpilogNum, StageNum};
2608       Epilog->push_back(NewMI);
2609     }
2610   }
2611 
2612   for (auto I : NewMIMap) {
2613     MachineInstr *MI = I.first;
2614     int EpilogNum = I.second.first;
2615     int StageNum = I.second.second;
2616     updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap);
2617   }
2618 
2619   // If there are remaining iterations, they are executed in the original loop.
2620   // Instructions related to loop control, such as the loop counter comparison,
2621   // are indicated by shouldIgnoreForPipelining() and are assumed to be in
2622   // stage 0; LastStage0Insts maps them to their clones in the last kernel copy.
2623   insertCondBranch(*Epilog, 0, LastStage0Insts, *NewPreheader, *NewExit);
2624 
2625   LLVM_DEBUG({
2626     dbgs() << "epilog:\n";
2627     Epilog->dump();
2628   });
2629 }
2630 
2631 /// Calculate the number of kernel unrolls required and set it to NumUnroll.
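/// The required count is driven by the longest def-to-use distance measured
/// in kernel copies: a cross-iteration use (one that reaches the use through
/// a phi) adds one copy, and each stage of difference between use and def
/// adds another; one copy is saved when the use precedes the def in the
/// instruction order. As an illustrative example, a use in stage 2 of a value
/// defined in stage 0 of the previous iteration, where the def precedes the
/// use in the instruction order, requires 1 + 1 + (2 - 0) = 4 unrolled copies.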
2632 void ModuloScheduleExpanderMVE::calcNumUnroll() {
2633   DenseMap<MachineInstr *, unsigned> Inst2Idx;
2634   NumUnroll = 1;
2635   for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I)
2636     Inst2Idx[Schedule.getInstructions()[I]] = I;
2637 
2638   for (MachineInstr *MI : Schedule.getInstructions()) {
2639     if (MI->isPHI())
2640       continue;
2641     int StageNum = Schedule.getStage(MI);
2642     for (const MachineOperand &MO : MI->uses()) {
2643       if (!MO.isReg() || !MO.getReg().isVirtual())
2644         continue;
2645       MachineInstr *DefMI = MRI.getVRegDef(MO.getReg());
2646       if (DefMI->getParent() != OrigKernel)
2647         continue;
2648 
2649       int NumUnrollLocal = 1;
2650       if (DefMI->isPHI()) {
2651         ++NumUnrollLocal;
2652         // canApply() guarantees that DefMI is not a phi and is an instruction
2653         // defined in the loop.
2654         DefMI = MRI.getVRegDef(getLoopPhiReg(*DefMI, OrigKernel));
2655       }
2656       NumUnrollLocal += StageNum - Schedule.getStage(DefMI);
2657       if (Inst2Idx[MI] <= Inst2Idx[DefMI])
2658         --NumUnrollLocal;
2659       NumUnroll = std::max(NumUnroll, NumUnrollLocal);
2660     }
2661   }
2662   LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n");
2663 }
2664 
2665 /// Create new virtual registers for definitions of NewMI and update NewMI.
2666 /// If the definitions are referenced after the pipelined loop, phis are
2667 /// created to merge them with the values from the other routes.
2668 void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI,
2669                                                ValueMapTy &VRMap,
2670                                                bool LastDef) {
2671   for (MachineOperand &MO : NewMI->all_defs()) {
2672     if (!MO.getReg().isVirtual())
2673       continue;
2674     Register Reg = MO.getReg();
2675     const TargetRegisterClass *RC = MRI.getRegClass(Reg);
2676     Register NewReg = MRI.createVirtualRegister(RC);
2677     MO.setReg(NewReg);
2678     VRMap[Reg] = NewReg;
2679     if (LastDef)
2680       mergeRegUsesAfterPipeline(Reg, NewReg);
2681   }
2682 }
2683 
2684 void ModuloScheduleExpanderMVE::expand() {
2685   OrigKernel = Schedule.getLoop()->getTopBlock();
2686   OrigPreheader = Schedule.getLoop()->getLoopPreheader();
2687   OrigExit = Schedule.getLoop()->getExitBlock();
2688 
2689   LLVM_DEBUG(Schedule.dump());
2690 
2691   generatePipelinedLoop();
2692 }
2693 
2694 /// Check if ModuloScheduleExpanderMVE can be applied to L
2695 bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
2696   if (!L.getExitBlock()) {
2697     LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block.\n");
2698     return false;
2699   }
2700 
2701   MachineBasicBlock *BB = L.getTopBlock();
2702   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
2703 
2704   // Put some constraints on the operands of the phis to simplify the
2705   // transformation
2706   DenseSet<unsigned> UsedByPhi;
2707   for (MachineInstr &MI : BB->phis()) {
2708     // Registers defined by phis must be used only inside the loop and must
2709     // never be used by phis.
2710     for (MachineOperand &MO : MI.defs())
2711       if (MO.isReg())
2712         for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
2713           if (Ref.getParent() != BB || Ref.isPHI()) {
2714             LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
2715                                  "referenced outside of the loop or by phi.\n");
2716             return false;
2717           }
2718 
2719     // A source register from the loop block must be defined inside the loop.
2720     // A register defined inside the loop must be referenced by at most one
2721     // phi.
2722     unsigned InitVal, LoopVal;
2723     getPhiRegs(MI, MI.getParent(), InitVal, LoopVal);
2724     if (!Register(LoopVal).isVirtual() ||
2725         MRI.getVRegDef(LoopVal)->getParent() != BB) {
2726       LLVM_DEBUG(
2727           dbgs() << "Can not apply MVE expander: A phi source value coming "
2728                     "from the loop is not defined in the loop.\n");
2729       return false;
2730     }
2731     if (UsedByPhi.count(LoopVal)) {
2732       LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the "
2733                            "loop is referenced by two or more phis.\n");
2734       return false;
2735     }
2736     UsedByPhi.insert(LoopVal);
2737   }
2738 
2739   return true;
2740 }
2741 
2742 //===----------------------------------------------------------------------===//
2743 // ModuloScheduleTestPass implementation
2744 //===----------------------------------------------------------------------===//
2745 // This pass reconstructs a ModuloSchedule from a machine function's
2746 // annotations and runs ModuloScheduleExpander.
2747 //
2748 // The machine function is expected to contain a single-block analyzable loop.
2749 // The total order of instructions is taken from the loop as-is.
2750 // Instructions are expected to be annotated with a PostInstrSymbol.
2751 // This PostInstrSymbol must have the following format:
2752 //  "Stage=%d Cycle=%d".
2753 //===----------------------------------------------------------------------===//
2754 
2755 namespace {
2756 class ModuloScheduleTest : public MachineFunctionPass {
2757 public:
2758   static char ID;
2759 
2760   ModuloScheduleTest() : MachineFunctionPass(ID) {
2761     initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
2762   }
2763 
2764   bool runOnMachineFunction(MachineFunction &MF) override;
2765   void runOnLoop(MachineFunction &MF, MachineLoop &L);
2766 
2767   void getAnalysisUsage(AnalysisUsage &AU) const override {
2768     AU.addRequired<MachineLoopInfoWrapperPass>();
2769     AU.addRequired<LiveIntervalsWrapperPass>();
2770     MachineFunctionPass::getAnalysisUsage(AU);
2771   }
2772 };
2773 } // namespace
2774 
2775 char ModuloScheduleTest::ID = 0;
2776 
2777 INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
2778                       "Modulo Schedule test pass", false, false)
2779 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
2780 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
2781 INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
2782                     "Modulo Schedule test pass", false, false)
2783 
2784 bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
2785   MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
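  // Only single-block loops are handled; expand the first such top-level loop
  // and return.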
2786   for (auto *L : MLI) {
2787     if (L->getTopBlock() != L->getBottomBlock())
2788       continue;
2789     runOnLoop(MF, *L);
2790     return false;
2791   }
2792   return false;
2793 }
2794 
2795 static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
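  // The symbol name encodes "Stage-<num>_Cycle-<num>". getToken() keeps the
  // delimiter at the front of the remainder, which is why the second token is
  // compared against "_Cycle" and why drop_front() is used to skip the '-'
  // before parsing each value.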
2796   std::pair<StringRef, StringRef> StageAndCycle = getToken(S, "_");
2797   std::pair<StringRef, StringRef> StageTokenAndValue =
2798       getToken(StageAndCycle.first, "-");
2799   std::pair<StringRef, StringRef> CycleTokenAndValue =
2800       getToken(StageAndCycle.second, "-");
2801   if (StageTokenAndValue.first != "Stage" ||
2802       CycleTokenAndValue.first != "_Cycle") {
2803     llvm_unreachable(
2804         "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
2805     return;
2806   }
2807 
2808   StageTokenAndValue.second.drop_front().getAsInteger(10, Stage);
2809   CycleTokenAndValue.second.drop_front().getAsInteger(10, Cycle);
2810 
2811   dbgs() << "  Stage=" << Stage << ", Cycle=" << Cycle << "\n";
2812 }
2813 
2814 void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
2815   LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
2816   MachineBasicBlock *BB = L.getTopBlock();
2817   dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
2818 
2819   DenseMap<MachineInstr *, int> Cycle, Stage;
2820   std::vector<MachineInstr *> Instrs;
2821   for (MachineInstr &MI : *BB) {
2822     if (MI.isTerminator())
2823       continue;
2824     Instrs.push_back(&MI);
2825     if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
2826       dbgs() << "Parsing post-instr symbol for " << MI;
2827       parseSymbolString(Sym->getName(), Cycle[&MI], Stage[&MI]);
2828     }
2829   }
2830 
2831   ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
2832                     std::move(Stage));
2833   ModuloScheduleExpander MSE(
2834       MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
2835   MSE.expand();
2836   MSE.cleanup();
2837 }
2838 
2839 //===----------------------------------------------------------------------===//
2840 // ModuloScheduleTestAnnotater implementation
2841 //===----------------------------------------------------------------------===//
2842 
2843 void ModuloScheduleTestAnnotater::annotate() {
2844   for (MachineInstr *MI : S.getInstructions()) {
2845     SmallVector<char, 16> SV;
2846     raw_svector_ostream OS(SV);
2847     OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
2848     MCSymbol *Sym = MF.getContext().getOrCreateSymbol(OS.str());
2849     MI->setPostInstrSymbol(MF, Sym);
2850   }
2851 }
2852