1 //===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This implements the ScheduleDAGInstrs class, which supports
10 /// re-scheduling of MachineInstrs.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
15 
16 #include "llvm/ADT/IntEqClasses.h"
17 #include "llvm/ADT/MapVector.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/SparseSet.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/Analysis/AliasAnalysis.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/CodeGen/LiveIntervals.h"
24 #include "llvm/CodeGen/LivePhysRegs.h"
25 #include "llvm/CodeGen/MachineBasicBlock.h"
26 #include "llvm/CodeGen/MachineFrameInfo.h"
27 #include "llvm/CodeGen/MachineFunction.h"
28 #include "llvm/CodeGen/MachineInstr.h"
29 #include "llvm/CodeGen/MachineInstrBundle.h"
30 #include "llvm/CodeGen/MachineMemOperand.h"
31 #include "llvm/CodeGen/MachineOperand.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/CodeGen/PseudoSourceValue.h"
34 #include "llvm/CodeGen/RegisterPressure.h"
35 #include "llvm/CodeGen/ScheduleDAG.h"
36 #include "llvm/CodeGen/ScheduleDFS.h"
37 #include "llvm/CodeGen/SlotIndexes.h"
38 #include "llvm/CodeGen/TargetInstrInfo.h"
39 #include "llvm/CodeGen/TargetRegisterInfo.h"
40 #include "llvm/CodeGen/TargetSubtargetInfo.h"
41 #include "llvm/Config/llvm-config.h"
42 #include "llvm/IR/Constants.h"
43 #include "llvm/IR/Function.h"
44 #include "llvm/IR/Type.h"
45 #include "llvm/IR/Value.h"
46 #include "llvm/MC/LaneBitmask.h"
47 #include "llvm/MC/MCRegisterInfo.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/Compiler.h"
51 #include "llvm/Support/Debug.h"
52 #include "llvm/Support/ErrorHandling.h"
53 #include "llvm/Support/Format.h"
54 #include "llvm/Support/raw_ostream.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <iterator>
58 #include <utility>
59 #include <vector>
60 
61 using namespace llvm;
62 
63 #define DEBUG_TYPE "machine-scheduler"
64 
65 static cl::opt<bool>
66     EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
67                     cl::desc("Enable use of AA during MI DAG construction"));
68 
69 static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
70     cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));
71 
72 // Note: the two options below might be used in tuning compile time vs
73 // output quality. Setting HugeRegion so large that it will never be
74 // reached means best-effort, but may be slow.
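// For example, with the defaults below (HugeRegion = 1000 and ReductionSize =
// HugeRegion / 2), once the maps together hold 1000 SUs, roughly 500 of them
// are chained behind a new BarrierChain and dropped from the maps.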
75 
76 // When the Stores and Loads maps (or NonAliasStores and NonAliasLoads)
77 // together hold this many SUs, a reduction of the maps will be done.
78 static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
79     cl::init(1000), cl::desc("The limit to use while constructing the DAG "
80                              "prior to scheduling, at which point a trade-off "
81                              "is made to avoid excessive compile time."));
82 
83 static cl::opt<unsigned> ReductionSize(
84     "dag-maps-reduction-size", cl::Hidden,
85     cl::desc("A huge scheduling region will have maps reduced by this many "
86              "nodes at a time. Defaults to HugeRegion / 2."));
87 
88 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
89 static cl::opt<bool> SchedPrintCycles(
90     "sched-print-cycles", cl::Hidden, cl::init(false),
91     cl::desc("Report top/bottom cycles when dumping SUnit instances"));
92 #endif
93 
94 static unsigned getReductionSize() {
95   // Always reduce a huge region by half of the elements, except
96   // when the user sets this number explicitly.
97   if (ReductionSize.getNumOccurrences() == 0)
98     return HugeRegion / 2;
99   return ReductionSize;
100 }
101 
102 static void dumpSUList(const ScheduleDAGInstrs::SUList &L) {
103 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
104   dbgs() << "{ ";
105   for (const SUnit *SU : L) {
106     dbgs() << "SU(" << SU->NodeNum << ")";
107     if (SU != L.back())
108       dbgs() << ", ";
109   }
110   dbgs() << "}\n";
111 #endif
112 }
113 
114 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
115                                      const MachineLoopInfo *mli,
116                                      bool RemoveKillFlags)
117     : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
118       RemoveKillFlags(RemoveKillFlags),
119       UnknownValue(UndefValue::get(Type::getVoidTy(mf.getFunction().getContext()))),
120       Topo(SUnits, &ExitSU) {
121   DbgValues.clear();
122 
123   const TargetSubtargetInfo &ST = mf.getSubtarget();
124   SchedModel.init(&ST);
125 }
126 
127 /// If this machine instr has memory reference information and it can be
128 /// tracked to a normal reference to a known object, return the Value
129 /// for that object. This function returns false if the memory location is
130 /// unknown or may alias anything.
131 static bool getUnderlyingObjectsForInstr(const MachineInstr *MI,
132                                          const MachineFrameInfo &MFI,
133                                          UnderlyingObjectsVector &Objects,
134                                          const DataLayout &DL) {
135   auto AllMMOsOkay = [&]() {
136     for (const MachineMemOperand *MMO : MI->memoperands()) {
137       // TODO: Figure out whether isAtomic is really necessary (see D57601).
138       if (MMO->isVolatile() || MMO->isAtomic())
139         return false;
140 
141       if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
142         // Functions that contain tail calls don't have unique PseudoSourceValue
143         // objects. Two PseudoSourceValues might refer to the same or
144         // overlapping locations. The client code calling this function assumes
145         // this is not the case. So return a conservative answer of no known
146         // object.
147         if (MFI.hasTailCall())
148           return false;
149 
150         // For now, ignore PseudoSourceValues which may alias LLVM IR values
151         // because the code that uses this function has no way to cope with
152         // such aliases.
153         if (PSV->isAliased(&MFI))
154           return false;
155 
156         bool MayAlias = PSV->mayAlias(&MFI);
157         Objects.emplace_back(PSV, MayAlias);
158       } else if (const Value *V = MMO->getValue()) {
159         SmallVector<Value *, 4> Objs;
160         if (!getUnderlyingObjectsForCodeGen(V, Objs))
161           return false;
162 
163         for (Value *V : Objs) {
164           assert(isIdentifiedObject(V));
165           Objects.emplace_back(V, true);
166         }
167       } else
168         return false;
169     }
170     return true;
171   };
172 
173   if (!AllMMOsOkay()) {
174     Objects.clear();
175     return false;
176   }
177 
178   return true;
179 }
180 
181 void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
182   BB = bb;
183 }
184 
185 void ScheduleDAGInstrs::finishBlock() {
186   // Subclasses should no longer refer to the old block.
187   BB = nullptr;
188 }
189 
190 void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
191                                     MachineBasicBlock::iterator begin,
192                                     MachineBasicBlock::iterator end,
193                                     unsigned regioninstrs) {
194   assert(bb == BB && "startBlock should set BB");
195   RegionBegin = begin;
196   RegionEnd = end;
197   NumRegionInstrs = regioninstrs;
198 }
199 
200 void ScheduleDAGInstrs::exitRegion() {
201   // Nothing to do.
202 }
203 
204 void ScheduleDAGInstrs::addSchedBarrierDeps() {
205   MachineInstr *ExitMI =
206       RegionEnd != BB->end()
207           ? &*skipDebugInstructionsBackward(RegionEnd, RegionBegin)
208           : nullptr;
209   ExitSU.setInstr(ExitMI);
210   // Add dependencies on the defs and uses of the instruction.
211   if (ExitMI) {
212     for (const MachineOperand &MO : ExitMI->all_uses()) {
213       Register Reg = MO.getReg();
214       if (Reg.isPhysical()) {
215         for (MCRegUnit Unit : TRI->regunits(Reg))
216           Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
217       } else if (Reg.isVirtual() && MO.readsReg()) {
218         addVRegUseDeps(&ExitSU, MO.getOperandNo());
219       }
220     }
221   }
222   if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
223     // For others, e.g. fallthrough, conditional branch, assume the exit
224     // uses all the registers that are livein to the successor blocks.
225     for (const MachineBasicBlock *Succ : BB->successors()) {
226       for (const auto &LI : Succ->liveins()) {
227         for (MCRegUnitMaskIterator U(LI.PhysReg, TRI); U.isValid(); ++U) {
228           auto [Unit, Mask] = *U;
229           if ((Mask & LI.LaneMask).any() && !Uses.contains(Unit))
230             Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
231         }
232       }
233     }
234   }
235 }
236 
237 /// MO is an operand of SU's instruction that defines a physical register. Adds
238 /// data dependencies from SU to any uses of the physical register.
239 void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
240   const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
241   assert(MO.isDef() && "expect physreg def");
242   Register Reg = MO.getReg();
243 
244   // Ask the target if address-backscheduling is desirable, and if so how much.
245   const TargetSubtargetInfo &ST = MF.getSubtarget();
246 
247   // Only use a non-zero latency for real defs/uses, in contrast to
248   // "fake" operands added by regalloc.
249   const MCInstrDesc &DefMIDesc = SU->getInstr()->getDesc();
250   bool ImplicitPseudoDef = (OperIdx >= DefMIDesc.getNumOperands() &&
251                             !DefMIDesc.hasImplicitDefOfPhysReg(Reg));
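  // For example, a return-value register that is appended to a call as an
  // extra implicit def during lowering is typically not declared in the call's
  // MCInstrDesc; such an operand is treated as a pseudo operand here and given
  // zero latency.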
252   for (MCRegUnit Unit : TRI->regunits(Reg)) {
253     for (RegUnit2SUnitsMap::iterator I = Uses.find(Unit); I != Uses.end();
254          ++I) {
255       SUnit *UseSU = I->SU;
256       if (UseSU == SU)
257         continue;
258 
259       // Adjust the dependence latency using operand def/use information,
260       // then allow the target to perform its own adjustments.
261       MachineInstr *UseInstr = nullptr;
262       int UseOpIdx = I->OpIdx;
263       bool ImplicitPseudoUse = false;
264       SDep Dep;
265       if (UseOpIdx < 0) {
266         Dep = SDep(SU, SDep::Artificial);
267       } else {
268         // Set the hasPhysRegDefs only for physreg defs that have a use within
269         // the scheduling region.
270         SU->hasPhysRegDefs = true;
271 
272         UseInstr = UseSU->getInstr();
273         Register UseReg = UseInstr->getOperand(UseOpIdx).getReg();
274         const MCInstrDesc &UseMIDesc = UseInstr->getDesc();
275         ImplicitPseudoUse = UseOpIdx >= ((int)UseMIDesc.getNumOperands()) &&
276                             !UseMIDesc.hasImplicitUseOfPhysReg(UseReg);
277 
278         Dep = SDep(SU, SDep::Data, UseReg);
279       }
280       if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
281         Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
282                                                         UseInstr, UseOpIdx));
283       } else {
284         Dep.setLatency(0);
285       }
286       ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOpIdx, Dep, &SchedModel);
287       UseSU->addPred(Dep);
288     }
289   }
290 }
291 
292 /// Adds register dependencies (data, anti, and output) from this SUnit
293 /// to following instructions in the same scheduling region that depend on the
294 /// physical register referenced at OperIdx.
295 void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
296   MachineInstr *MI = SU->getInstr();
297   MachineOperand &MO = MI->getOperand(OperIdx);
298   Register Reg = MO.getReg();
299   // We do not need to track any dependencies for constant registers.
300   if (MRI.isConstantPhysReg(Reg))
301     return;
302 
303   const TargetSubtargetInfo &ST = MF.getSubtarget();
304 
305   // Optionally add output and anti dependencies. For anti
306   // dependencies we use a latency of 0 because for a multi-issue
307   // target we want to allow the defining instruction to issue
308   // in the same cycle as the using instruction.
309   // TODO: Using a latency of 1 here for output dependencies assumes
310   //       there's no cost for reusing registers.
311   SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
312   for (MCRegUnit Unit : TRI->regunits(Reg)) {
313     for (RegUnit2SUnitsMap::iterator I = Defs.find(Unit); I != Defs.end();
314          ++I) {
315       SUnit *DefSU = I->SU;
316       if (DefSU == &ExitSU)
317         continue;
318       MachineInstr *DefInstr = DefSU->getInstr();
319       MachineOperand &DefMO = DefInstr->getOperand(I->OpIdx);
320       if (DefSU != SU &&
321           (Kind != SDep::Output || !MO.isDead() || !DefMO.isDead())) {
322         SDep Dep(SU, Kind, DefMO.getReg());
323         if (Kind != SDep::Anti) {
324           Dep.setLatency(
325               SchedModel.computeOutputLatency(MI, OperIdx, DefInstr));
326         }
327         ST.adjustSchedDependency(SU, OperIdx, DefSU, I->OpIdx, Dep,
328                                  &SchedModel);
329         DefSU->addPred(Dep);
330       }
331     }
332   }
333 
334   if (MO.isUse()) {
335     SU->hasPhysRegUses = true;
336     // Either insert a new Reg2SUnits entry with an empty SUnits list, or
337     // retrieve the existing SUnits list for this register's uses.
338     // Push this SUnit on the use list.
339     for (MCRegUnit Unit : TRI->regunits(Reg))
340       Uses.insert(PhysRegSUOper(SU, OperIdx, Unit));
341     if (RemoveKillFlags)
342       MO.setIsKill(false);
343   } else {
344     addPhysRegDataDeps(SU, OperIdx);
345 
346     // Clear previous uses and defs of this register and its subregisters.
347     for (MCRegUnit Unit : TRI->regunits(Reg)) {
348       Uses.eraseAll(Unit);
349       if (!MO.isDead())
350         Defs.eraseAll(Unit);
351     }
352 
353     if (MO.isDead() && SU->isCall) {
354       // Calls will not be reordered because of chain dependencies (see
355       // below). Since call operands are dead, calls may continue to be added
356       // to the DefList making dependence checking quadratic in the size of
357       // the block. Instead, we leave only one call at the back of the
358       // DefList.
359       for (MCRegUnit Unit : TRI->regunits(Reg)) {
360         RegUnit2SUnitsMap::RangePair P = Defs.equal_range(Unit);
361         RegUnit2SUnitsMap::iterator B = P.first;
362         RegUnit2SUnitsMap::iterator I = P.second;
363         for (bool isBegin = I == B; !isBegin; /* empty */) {
364           isBegin = (--I) == B;
365           if (!I->SU->isCall)
366             break;
367           I = Defs.erase(I);
368         }
369       }
370     }
371 
372     // Defs are pushed in the order they are visited and never reordered.
373     for (MCRegUnit Unit : TRI->regunits(Reg))
374       Defs.insert(PhysRegSUOper(SU, OperIdx, Unit));
375   }
376 }
377 
378 LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
379 {
380   Register Reg = MO.getReg();
381   // No point in tracking lanemasks if we don't have interesting subregisters.
382   const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
383   if (!RC.HasDisjunctSubRegs)
384     return LaneBitmask::getAll();
385 
386   unsigned SubReg = MO.getSubReg();
387   if (SubReg == 0)
388     return RC.getLaneMask();
389   return TRI->getSubRegIndexLaneMask(SubReg);
390 }
391 
392 bool ScheduleDAGInstrs::deadDefHasNoUse(const MachineOperand &MO) {
393   auto RegUse = CurrentVRegUses.find(MO.getReg());
394   if (RegUse == CurrentVRegUses.end())
395     return true;
396   return (RegUse->LaneMask & getLaneMaskForMO(MO)).none();
397 }
398 
399 /// Adds register output and data dependencies from this SUnit to instructions
400 /// that occur later in the same scheduling region if they read from or write to
401 /// the virtual register defined at OperIdx.
402 ///
403 /// TODO: Hoist loop induction variable increments. This has to be
404 /// reevaluated. Generally, IV scheduling should be done before coalescing.
405 void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
406   MachineInstr *MI = SU->getInstr();
407   MachineOperand &MO = MI->getOperand(OperIdx);
408   Register Reg = MO.getReg();
409 
410   LaneBitmask DefLaneMask;
411   LaneBitmask KillLaneMask;
412   if (TrackLaneMasks) {
413     bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
414     DefLaneMask = getLaneMaskForMO(MO);
415     // If we have a <read-undef> flag, none of the lane values comes from an
416     // earlier instruction.
417     KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask;
418 
419     if (MO.getSubReg() != 0 && MO.isUndef()) {
420       // Later operands of the same instruction may contain other subregister
421       // defs of the same register. The lanes of those other defs will now be live
422       // after this instruction, so these should not be treated as killed by the
423       // instruction even though they appear to be killed in this one operand.
424       for (const MachineOperand &OtherMO :
425            llvm::drop_begin(MI->operands(), OperIdx + 1))
426         if (OtherMO.isReg() && OtherMO.isDef() && OtherMO.getReg() == Reg)
427           KillLaneMask &= ~getLaneMaskForMO(OtherMO);
428     }
429 
430     // Clear undef flag, we'll re-add it later once we know which subregister
431     // Def is first.
432     MO.setIsUndef(false);
433   } else {
434     DefLaneMask = LaneBitmask::getAll();
435     KillLaneMask = LaneBitmask::getAll();
436   }
437 
438   if (MO.isDead()) {
439     assert(deadDefHasNoUse(MO) && "Dead defs should have no uses");
440   } else {
441     // Add data dependence to all uses we found so far.
442     const TargetSubtargetInfo &ST = MF.getSubtarget();
443     for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
444          E = CurrentVRegUses.end(); I != E; /*empty*/) {
445       LaneBitmask LaneMask = I->LaneMask;
446       // Ignore uses of other lanes.
447       if ((LaneMask & KillLaneMask).none()) {
448         ++I;
449         continue;
450       }
451 
452       if ((LaneMask & DefLaneMask).any()) {
453         SUnit *UseSU = I->SU;
454         MachineInstr *Use = UseSU->getInstr();
455         SDep Dep(SU, SDep::Data, Reg);
456         Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
457                                                         I->OperandIndex));
458         ST.adjustSchedDependency(SU, OperIdx, UseSU, I->OperandIndex, Dep,
459                                  &SchedModel);
460         UseSU->addPred(Dep);
461       }
462 
463       LaneMask &= ~KillLaneMask;
464       // If we found a Def for all lanes of this use, remove it from the list.
465       if (LaneMask.any()) {
466         I->LaneMask = LaneMask;
467         ++I;
468       } else
469         I = CurrentVRegUses.erase(I);
470     }
471   }
472 
473   // Shortcut: Singly defined vregs do not have output/anti dependencies.
474   if (MRI.hasOneDef(Reg))
475     return;
476 
477   // Add output dependence to the next nearest defs of this vreg.
478   //
479   // Unless this definition is dead, the output dependence should be
480   // transitively redundant with antidependencies from this definition's
481   // uses. We're conservative for now until we have a way to guarantee the uses
482   // are not eliminated sometime during scheduling. The output dependence edge
483   // is also useful if output latency exceeds def-use latency.
484   LaneBitmask LaneMask = DefLaneMask;
485   for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
486                                      CurrentVRegDefs.end())) {
487     // Ignore defs for other lanes.
488     if ((V2SU.LaneMask & LaneMask).none())
489       continue;
490     // Add an output dependence.
491     SUnit *DefSU = V2SU.SU;
492     // Ignore additional defs of the same lanes in one instruction. This can
493     // happen because lanemasks are shared for targets with too many
494     // subregisters. We also use some representation tricks/hacks where we
495     // add super-register defs/uses, to imply that although we only access parts
496     // of the reg we care about the full one.
497     if (DefSU == SU)
498       continue;
499     SDep Dep(SU, SDep::Output, Reg);
500     Dep.setLatency(
501       SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
502     DefSU->addPred(Dep);
503 
504     // Update current definition. This can get tricky if the def covered a
505     // bigger lanemask before. We then have to shrink it and create a new
506     // VReg2SUnit for the non-overlapping part.
507     LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
508     LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
509     V2SU.SU = SU;
510     V2SU.LaneMask = OverlapMask;
511     if (NonOverlapMask.any())
512       CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
513   }
514   // If there was no CurrentVRegDefs entry for some lanes yet, create one.
515   if (LaneMask.any())
516     CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
517 }
518 
519 /// Adds a register data dependency if the instruction that defines the
520 /// virtual register used at OperIdx is mapped to an SUnit. Add a register
521 /// antidependency from this SUnit to instructions that occur later in the same
522 /// scheduling region if they write the virtual register.
523 ///
524 /// TODO: Handle ExitSU "uses" properly.
525 void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
526   const MachineInstr *MI = SU->getInstr();
527   assert(!MI->isDebugOrPseudoInstr());
528 
529   const MachineOperand &MO = MI->getOperand(OperIdx);
530   Register Reg = MO.getReg();
531 
532   // Remember the use. Data dependencies will be added when we find the def.
533   LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO)
534                                         : LaneBitmask::getAll();
535   CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));
536 
537   // Add antidependences to the following defs of the vreg.
538   for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
539                                      CurrentVRegDefs.end())) {
540     // Ignore defs for unrelated lanes.
541     LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
542     if ((PrevDefLaneMask & LaneMask).none())
543       continue;
544     if (V2SU.SU == SU)
545       continue;
546 
547     V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
548   }
549 }
550 
551 
552 void ScheduleDAGInstrs::addChainDependency(SUnit *SUa, SUnit *SUb,
553                                            unsigned Latency) {
554   if (SUa->getInstr()->mayAlias(getAAForDep(), *SUb->getInstr(), UseTBAA)) {
555     SDep Dep(SUa, SDep::MayAliasMem);
556     Dep.setLatency(Latency);
557     SUb->addPred(Dep);
558   }
559 }
560 
561 /// Creates an SUnit for each real instruction, numbered in top-down
562 /// topological order. The instruction order A < B implies that no edge exists
563 /// from B to A.
564 ///
565 /// Map each real instruction to its SUnit.
566 ///
567 /// After initSUnits, the SUnits vector cannot be resized and the scheduler may
568 /// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
569 /// instead of pointers.
570 ///
571 /// MachineScheduler relies on initSUnits numbering the nodes by their order in
572 /// the original instruction list.
573 void ScheduleDAGInstrs::initSUnits() {
574   // We'll be allocating one SUnit for each real instruction in the region,
575   // which is contained within a basic block.
576   SUnits.reserve(NumRegionInstrs);
577 
578   for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
579     if (MI.isDebugOrPseudoInstr())
580       continue;
581 
582     SUnit *SU = newSUnit(&MI);
583     MISUnitMap[&MI] = SU;
584 
585     SU->isCall = MI.isCall();
586     SU->isCommutable = MI.isCommutable();
587 
588     // Assign the Latency field of SU using target-provided information.
589     SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
590 
591     // If this SUnit uses a reserved or unbuffered resource, mark it as such.
592     //
593     // Reserved resources block an instruction from issuing and stall the
594     // entire pipeline. These are identified by BufferSize=0.
595     //
596     // Unbuffered resources prevent execution of subsequent instructions that
597     // require the same resources. This is used for in-order execution pipelines
598     // within an out-of-order core. These are identified by BufferSize=1.
599     if (SchedModel.hasInstrSchedModel()) {
600       const MCSchedClassDesc *SC = getSchedClass(SU);
601       for (const MCWriteProcResEntry &PRE :
602            make_range(SchedModel.getWriteProcResBegin(SC),
603                       SchedModel.getWriteProcResEnd(SC))) {
604         switch (SchedModel.getProcResource(PRE.ProcResourceIdx)->BufferSize) {
605         case 0:
606           SU->hasReservedResource = true;
607           break;
608         case 1:
609           SU->isUnbuffered = true;
610           break;
611         default:
612           break;
613         }
614       }
615     }
616   }
617 }
618 
619 class ScheduleDAGInstrs::Value2SUsMap
620     : public SmallMapVector<ValueType, SUList, 4> {
621   /// Current total number of SUs in map.
622   unsigned NumNodes = 0;
623 
624   /// 1 for loads, 0 for stores. (see comment in SUList)
625   unsigned TrueMemOrderLatency;
626 
627 public:
628   Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}
629 
630   /// To keep NumNodes up to date, insert() is used instead of
631   /// this operator w/ push_back().
632   ValueType &operator[](const SUList &Key) {
633     llvm_unreachable("Don't use. Use insert() instead."); }
634 
635   /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling
636   /// reduce().
637   void inline insert(SUnit *SU, ValueType V) {
638     MapVector::operator[](V).push_back(SU);
639     NumNodes++;
640   }
641 
642   /// Clears the list of SUs mapped to V.
643   void inline clearList(ValueType V) {
644     iterator Itr = find(V);
645     if (Itr != end()) {
646       assert(NumNodes >= Itr->second.size());
647       NumNodes -= Itr->second.size();
648 
649       Itr->second.clear();
650     }
651   }
652 
653   /// Clears map from all contents.
654   void clear() {
655     SmallMapVector<ValueType, SUList, 4>::clear();
656     NumNodes = 0;
657   }
658 
659   unsigned inline size() const { return NumNodes; }
660 
661   /// Counts the number of SUs in this map after a reduction.
662   void reComputeSize() {
663     NumNodes = 0;
664     for (auto &I : *this)
665       NumNodes += I.second.size();
666   }
667 
668   unsigned inline getTrueMemOrderLatency() const {
669     return TrueMemOrderLatency;
670   }
671 
672   void dump();
673 };
674 
675 void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
676                                              Value2SUsMap &Val2SUsMap) {
677   for (auto &I : Val2SUsMap)
678     addChainDependencies(SU, I.second,
679                          Val2SUsMap.getTrueMemOrderLatency());
680 }
681 
682 void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
683                                              Value2SUsMap &Val2SUsMap,
684                                              ValueType V) {
685   Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
686   if (Itr != Val2SUsMap.end())
687     addChainDependencies(SU, Itr->second,
688                          Val2SUsMap.getTrueMemOrderLatency());
689 }
690 
691 void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
692   assert(BarrierChain != nullptr);
693 
694   for (auto &[V, SUs] : map) {
695     (void)V;
696     for (auto *SU : SUs)
697       SU->addPredBarrier(BarrierChain);
698   }
699   map.clear();
700 }
701 
702 void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
703   assert(BarrierChain != nullptr);
704 
705   // Go through all lists of SUs.
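  // Note that the maps are filled while walking the region bottom-up, so each
  // SUList is ordered by decreasing NodeNum: the SUs below BarrierChain, which
  // must become its successors, come first.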
706   for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
707     Value2SUsMap::iterator CurrItr = I++;
708     SUList &sus = CurrItr->second;
709     SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
710     for (; SUItr != SUEE; ++SUItr) {
711       // Stop on BarrierChain or any instruction above it.
712       if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
713         break;
714 
715       (*SUItr)->addPredBarrier(BarrierChain);
716     }
717 
718     // Also remove the BarrierChain from the list if present.
719     if (SUItr != SUEE && *SUItr == BarrierChain)
720       SUItr++;
721 
722     // Remove all SUs that are now successors of BarrierChain.
723     if (SUItr != sus.begin())
724       sus.erase(sus.begin(), SUItr);
725   }
726 
727   // Remove all entries with empty SU lists.
728   map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
729       return (mapEntry.second.empty()); });
730 
731   // Recompute the size of the map (NumNodes).
732   map.reComputeSize();
733 }
734 
735 void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA,
736                                         RegPressureTracker *RPTracker,
737                                         PressureDiffs *PDiffs,
738                                         LiveIntervals *LIS,
739                                         bool TrackLaneMasks) {
740   const TargetSubtargetInfo &ST = MF.getSubtarget();
741   bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
742                                                        : ST.useAA();
743   if (UseAA && AA)
744     AAForDep.emplace(*AA);
745 
746   BarrierChain = nullptr;
747 
748   this->TrackLaneMasks = TrackLaneMasks;
749   MISUnitMap.clear();
750   ScheduleDAG::clearDAG();
751 
752   // Create an SUnit for each real instruction.
753   initSUnits();
754 
755   if (PDiffs)
756     PDiffs->init(SUnits.size());
757 
758   // We build scheduling units by walking a block's instruction list
759   // from bottom to top.
760 
761   // Each MI's memory operand(s) is analyzed into a list of underlying
762   // objects. The SU is then inserted in the SUList(s) mapped from the
763   // Value(s). Each Value thus gets mapped to lists of SUs depending
764   // on it, stores and loads kept separately. Two SUs are trivially
765   // non-aliasing if they both depend on only identified Values and do
766   // not share any common Value.
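  // For example, two stores whose addresses are traced to two distinct
  // identified allocas end up in different SULists and need no chain edge
  // between them, whereas a store with an unanalyzable address is mapped to
  // UnknownValue and chained against everything.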
767   Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);
768 
769   // Certain memory accesses are known to not alias any SU in Stores
770   // or Loads, and therefore have their own 'NonAlias'
771   // domain. E.g. spill / reload instructions never alias LLVM IR
772   // Values. It would be nice to assume that this type of memory
773   // access always has proper memory operand modelling, and is
774   // therefore never unanalyzable, but this is conservatively not
775   // done.
776   Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
777 
778   // Track all instructions that may raise floating-point exceptions.
779   // These do not depend on one another (or on normal loads or stores), but
780   // must not be rescheduled across global barriers.  Note that we don't
781   // really need a "map" here since we don't track those MIs by value;
782   // using the same Value2SUsMap data type here is simply a matter of
783   // convenience.
784   Value2SUsMap FPExceptions;
785 
786   // Remove any stale debug info; sometimes BuildSchedGraph is called again
787   // without emitting the info from the previous call.
788   DbgValues.clear();
789   FirstDbgValue = nullptr;
790 
791   assert(Defs.empty() && Uses.empty() &&
792          "Only BuildGraph should update Defs/Uses");
793   Defs.setUniverse(TRI->getNumRegs());
794   Uses.setUniverse(TRI->getNumRegs());
795 
796   assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
797   assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
798   unsigned NumVirtRegs = MRI.getNumVirtRegs();
799   CurrentVRegDefs.setUniverse(NumVirtRegs);
800   CurrentVRegUses.setUniverse(NumVirtRegs);
801 
802   // Model data dependencies between instructions being scheduled and the
803   // ExitSU.
804   addSchedBarrierDeps();
805 
806   // Walk the list of instructions, from bottom moving up.
807   MachineInstr *DbgMI = nullptr;
808   for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
809        MII != MIE; --MII) {
810     MachineInstr &MI = *std::prev(MII);
811     if (DbgMI) {
812       DbgValues.emplace_back(DbgMI, &MI);
813       DbgMI = nullptr;
814     }
815 
816     if (MI.isDebugValue() || MI.isDebugPHI()) {
817       DbgMI = &MI;
818       continue;
819     }
820 
821     if (MI.isDebugLabel() || MI.isDebugRef() || MI.isPseudoProbe())
822       continue;
823 
824     SUnit *SU = MISUnitMap[&MI];
825     assert(SU && "No SUnit mapped to this MI");
826 
827     if (RPTracker) {
828       RegisterOperands RegOpers;
829       RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false);
830       if (TrackLaneMasks) {
831         SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
832         RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx);
833       }
834       if (PDiffs != nullptr)
835         PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI);
836 
837       if (RPTracker->getPos() == RegionEnd || &*RPTracker->getPos() != &MI)
838         RPTracker->recedeSkipDebugValues();
839       assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
840       RPTracker->recede(RegOpers);
841     }
842 
843     assert(
844         (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
845         "Cannot schedule terminators or labels!");
846 
847     // Add register-based dependencies (data, anti, and output).
848     // For some instructions (calls, returns, inline-asm, etc.) there can
849     // be explicit uses and implicit defs, in which case the use will appear
850     // on the operand list before the def. Do two passes over the operand
851     // list to make sure that defs are processed before any uses.
852     bool HasVRegDef = false;
853     for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
854       const MachineOperand &MO = MI.getOperand(j);
855       if (!MO.isReg() || !MO.isDef())
856         continue;
857       Register Reg = MO.getReg();
858       if (Reg.isPhysical()) {
859         addPhysRegDeps(SU, j);
860       } else if (Reg.isVirtual()) {
861         HasVRegDef = true;
862         addVRegDefDeps(SU, j);
863       }
864     }
865     // Now process all uses.
866     for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
867       const MachineOperand &MO = MI.getOperand(j);
868       // Only look at use operands.
869       // We do not need to check for MO.readsReg() here because subsequent
870       // subregister defs will get output dependence edges and need no
871       // additional use dependencies.
872       if (!MO.isReg() || !MO.isUse())
873         continue;
874       Register Reg = MO.getReg();
875       if (Reg.isPhysical()) {
876         addPhysRegDeps(SU, j);
877       } else if (Reg.isVirtual() && MO.readsReg()) {
878         addVRegUseDeps(SU, j);
879       }
880     }
881 
882     // If we haven't seen any uses in this scheduling region, create a
883     // dependence edge to ExitSU to model the live-out latency. This is required
884     // for vreg defs with no in-region use, and prefetches with no vreg def.
885     //
886     // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
887     // check currently relies on being called before adding chain deps.
888     if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
889       SDep Dep(SU, SDep::Artificial);
890       Dep.setLatency(SU->Latency - 1);
891       ExitSU.addPred(Dep);
892     }
893 
894     // Add memory dependencies (Note: isStoreToStackSlot and
895     // isLoadFromStackSlot are not usable after stack slots are lowered to
896     // actual addresses).
897 
898     const TargetInstrInfo *TII = ST.getInstrInfo();
899     // This is a barrier event that acts as a pivotal node in the DAG.
900     if (TII->isGlobalMemoryObject(&MI)) {
901 
902       // Become the barrier chain.
903       if (BarrierChain)
904         BarrierChain->addPredBarrier(SU);
905       BarrierChain = SU;
906 
907       LLVM_DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
908                         << BarrierChain->NodeNum << ").\n");
909 
910       // Add dependencies against everything below it and clear maps.
911       addBarrierChain(Stores);
912       addBarrierChain(Loads);
913       addBarrierChain(NonAliasStores);
914       addBarrierChain(NonAliasLoads);
915       addBarrierChain(FPExceptions);
916 
917       continue;
918     }
919 
920     // Instructions that may raise FP exceptions may not be moved
921     // across any global barriers.
922     if (MI.mayRaiseFPException()) {
923       if (BarrierChain)
924         BarrierChain->addPredBarrier(SU);
925 
926       FPExceptions.insert(SU, UnknownValue);
927 
928       if (FPExceptions.size() >= HugeRegion) {
929         LLVM_DEBUG(dbgs() << "Reducing FPExceptions map.\n");
930         Value2SUsMap empty;
931         reduceHugeMemNodeMaps(FPExceptions, empty, getReductionSize());
932       }
933     }
934 
935     // If it's not a store or a variant load, we're done.
936     if (!MI.mayStore() &&
937         !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad()))
938       continue;
939 
940     // Always add a dependency edge to BarrierChain if present.
941     if (BarrierChain)
942       BarrierChain->addPredBarrier(SU);
943 
944     // Find the underlying objects for MI. The Objs vector is either
945     // empty, or filled with the Values of memory locations which this
946     // SU depends on.
947     UnderlyingObjectsVector Objs;
948     bool ObjsFound = getUnderlyingObjectsForInstr(&MI, MFI, Objs,
949                                                   MF.getDataLayout());
950 
951     if (MI.mayStore()) {
952       if (!ObjsFound) {
953         // An unknown store depends on all stores and loads.
954         addChainDependencies(SU, Stores);
955         addChainDependencies(SU, NonAliasStores);
956         addChainDependencies(SU, Loads);
957         addChainDependencies(SU, NonAliasLoads);
958 
959         // Map this store to 'UnknownValue'.
960         Stores.insert(SU, UnknownValue);
961       } else {
962         // Add precise dependencies against all previously seen memory
963         // accesses mapped to the same Value(s).
964         for (const UnderlyingObject &UnderlObj : Objs) {
965           ValueType V = UnderlObj.getValue();
966           bool ThisMayAlias = UnderlObj.mayAlias();
967 
968           // Add dependencies to previous stores and loads mapped to V.
969           addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
970           addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
971         }
972         // Update the store map after all chains have been added to avoid adding
973         // a self-loop edge if multiple underlying objects are present.
974         for (const UnderlyingObject &UnderlObj : Objs) {
975           ValueType V = UnderlObj.getValue();
976           bool ThisMayAlias = UnderlObj.mayAlias();
977 
978           // Map this store to V.
979           (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
980         }
981         // The store may have dependencies to unanalyzable loads and
982         // stores.
983         addChainDependencies(SU, Loads, UnknownValue);
984         addChainDependencies(SU, Stores, UnknownValue);
985       }
986     } else { // SU is a load.
987       if (!ObjsFound) {
988         // An unknown load depends on all stores.
989         addChainDependencies(SU, Stores);
990         addChainDependencies(SU, NonAliasStores);
991 
992         Loads.insert(SU, UnknownValue);
993       } else {
994         for (const UnderlyingObject &UnderlObj : Objs) {
995           ValueType V = UnderlObj.getValue();
996           bool ThisMayAlias = UnderlObj.mayAlias();
997 
998           // Add precise dependencies against all previously seen stores
999           // mapping to the same Value(s).
1000           addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
1001 
1002           // Map this load to V.
1003           (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
1004         }
1005         // The load may have dependencies to unanalyzable stores.
1006         addChainDependencies(SU, Stores, UnknownValue);
1007       }
1008     }
1009 
1010     // Reduce maps if they grow huge.
1011     if (Stores.size() + Loads.size() >= HugeRegion) {
1012       LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n");
1013       reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
1014     }
1015     if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
1016       LLVM_DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n");
1017       reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
1018     }
1019   }
1020 
1021   if (DbgMI)
1022     FirstDbgValue = DbgMI;
1023 
1024   Defs.clear();
1025   Uses.clear();
1026   CurrentVRegDefs.clear();
1027   CurrentVRegUses.clear();
1028 
1029   Topo.MarkDirty();
1030 }
1031 
1032 raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
1033   PSV->printCustom(OS);
1034   return OS;
1035 }
1036 
1037 void ScheduleDAGInstrs::Value2SUsMap::dump() {
1038   for (const auto &[ValType, SUs] : *this) {
1039     if (isa<const Value *>(ValType)) {
1040       const Value *V = cast<const Value *>(ValType);
1041       if (isa<UndefValue>(V))
1042         dbgs() << "Unknown";
1043       else
1044         V->printAsOperand(dbgs());
1045     } else if (isa<const PseudoSourceValue *>(ValType))
1046       dbgs() << cast<const PseudoSourceValue *>(ValType);
1047     else
1048       llvm_unreachable("Unknown Value type.");
1049 
1050     dbgs() << " : ";
1051     dumpSUList(SUs);
1052   }
1053 }
1054 
1055 void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
1056                                               Value2SUsMap &loads, unsigned N) {
1057   LLVM_DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; stores.dump();
1058              dbgs() << "Loading SUnits:\n"; loads.dump());
1059 
1060   // Insert the NodeNums of all SUs into a vector and sort it.
1061   std::vector<unsigned> NodeNums;
1062   NodeNums.reserve(stores.size() + loads.size());
1063   for (const auto &[V, SUs] : stores) {
1064     (void)V;
1065     for (const auto *SU : SUs)
1066       NodeNums.push_back(SU->NodeNum);
1067   }
1068   for (const auto &[V, SUs] : loads) {
1069     (void)V;
1070     for (const auto *SU : SUs)
1071       NodeNums.push_back(SU->NodeNum);
1072   }
1073   llvm::sort(NodeNums);
1074 
1075   // The last N elements in NodeNums will be removed, and the SU with
1076   // the lowest NodeNum of them will become the new BarrierChain to
1077   // let the not yet seen SUs have a dependency to the removed SUs.
1078   assert(N <= NodeNums.size());
1079   SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
1080   if (BarrierChain) {
1081     // The aliasing and non-aliasing maps reduce independently of each
1082     // other, but share a common BarrierChain. Check if the
1083     // newBarrierChain is above the former one. If it is not, it may
1084     // introduce a loop to use newBarrierChain, so keep the old one.
1085     if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
1086       BarrierChain->addPredBarrier(newBarrierChain);
1087       BarrierChain = newBarrierChain;
1088       LLVM_DEBUG(dbgs() << "Inserting new barrier chain: SU("
1089                         << BarrierChain->NodeNum << ").\n");
1090     }
1091     else
1092       LLVM_DEBUG(dbgs() << "Keeping old barrier chain: SU("
1093                         << BarrierChain->NodeNum << ").\n");
1094   }
1095   else
1096     BarrierChain = newBarrierChain;
1097 
1098   insertBarrierChain(stores);
1099   insertBarrierChain(loads);
1100 
1101   LLVM_DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n"; stores.dump();
1102              dbgs() << "Loading SUnits:\n"; loads.dump());
1103 }
1104 
1105 static void toggleKills(const MachineRegisterInfo &MRI, LiveRegUnits &LiveRegs,
1106                         MachineInstr &MI, bool addToLiveRegs) {
1107   for (MachineOperand &MO : MI.operands()) {
1108     if (!MO.isReg() || !MO.readsReg())
1109       continue;
1110     Register Reg = MO.getReg();
1111     if (!Reg)
1112       continue;
1113 
1114     // Things that are available after the instruction are killed by it.
1115     bool IsKill = LiveRegs.available(Reg);
1116 
1117     // Exception: Do not kill reserved registers
1118     MO.setIsKill(IsKill && !MRI.isReserved(Reg));
1119     if (addToLiveRegs)
1120       LiveRegs.addReg(Reg);
1121   }
1122 }
1123 
1124 void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
1125   LLVM_DEBUG(dbgs() << "Fixup kills for " << printMBBReference(MBB) << '\n');
1126 
1127   LiveRegs.init(*TRI);
1128   LiveRegs.addLiveOuts(MBB);
1129 
1130   // Examine block from end to start...
1131   for (MachineInstr &MI : llvm::reverse(MBB)) {
1132     if (MI.isDebugOrPseudoInstr())
1133       continue;
1134 
1135     // Update liveness.  Registers that are defined but not used in this
1136     // instruction are now dead. Remove the register and all its subregs, as
1137     // they are completely defined.
1138     for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
1139       const MachineOperand &MO = *O;
1140       if (MO.isReg()) {
1141         if (!MO.isDef())
1142           continue;
1143         Register Reg = MO.getReg();
1144         if (!Reg)
1145           continue;
1146         LiveRegs.removeReg(Reg);
1147       } else if (MO.isRegMask()) {
1148         LiveRegs.removeRegsNotPreserved(MO.getRegMask());
1149       }
1150     }
1151     // If there is a bundle header, fix it up first.
1152     // If there is a bundle header fix it up first.
1153     if (!MI.isBundled()) {
1154       toggleKills(MRI, LiveRegs, MI, true);
1155     } else {
1156       MachineBasicBlock::instr_iterator Bundle = MI.getIterator();
1157       if (MI.isBundle())
1158         toggleKills(MRI, LiveRegs, MI, false);
1159 
1160       // Some targets make the (questionable) assumption that the instructions
1161       // inside the bundle are ordered and consequently only the last use of
1162       // a register inside the bundle can kill it.
1163       MachineBasicBlock::instr_iterator I = std::next(Bundle);
1164       while (I->isBundledWithSucc())
1165         ++I;
1166       do {
1167         if (!I->isDebugOrPseudoInstr())
1168           toggleKills(MRI, LiveRegs, *I, true);
1169         --I;
1170       } while (I != Bundle);
1171     }
1172   }
1173 }
1174 
1175 void ScheduleDAGInstrs::dumpNode(const SUnit &SU) const {
1176 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1177   dumpNodeName(SU);
1178   if (SchedPrintCycles)
1179     dbgs() << " [TopReadyCycle = " << SU.TopReadyCycle
1180            << ", BottomReadyCycle = " << SU.BotReadyCycle << "]";
1181   dbgs() << ": ";
1182   SU.getInstr()->dump();
1183 #endif
1184 }
1185 
1186 void ScheduleDAGInstrs::dump() const {
1187 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1188   if (EntrySU.getInstr() != nullptr)
1189     dumpNodeAll(EntrySU);
1190   for (const SUnit &SU : SUnits)
1191     dumpNodeAll(SU);
1192   if (ExitSU.getInstr() != nullptr)
1193     dumpNodeAll(ExitSU);
1194 #endif
1195 }
1196 
1197 std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
1198   std::string s;
1199   raw_string_ostream oss(s);
1200   if (SU == &EntrySU)
1201     oss << "<entry>";
1202   else if (SU == &ExitSU)
1203     oss << "<exit>";
1204   else
1205     SU->getInstr()->print(oss, /*IsStandalone=*/true);
1206   return s;
1207 }
1208 
1209 /// Return the basic block label. It is not necessarily unique because a block
1210 /// may contain multiple scheduling regions, but it is fine for visualization.
1211 std::string ScheduleDAGInstrs::getDAGName() const {
1212   return "dag." + BB->getFullName();
1213 }
1214 
1215 bool ScheduleDAGInstrs::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
1216   return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
1217 }
1218 
1219 bool ScheduleDAGInstrs::addEdge(SUnit *SuccSU, const SDep &PredDep) {
1220   if (SuccSU != &ExitSU) {
1221     // Do not use WillCreateCycle, it assumes SD scheduling.
1222     // If Pred is reachable from Succ, then the edge creates a cycle.
1223     if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
1224       return false;
1225     Topo.AddPredQueued(SuccSU, PredDep.getSUnit());
1226   }
1227   SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
1228   // Return true regardless of whether a new edge needed to be inserted.
1229   return true;
1230 }
1231 
1232 //===----------------------------------------------------------------------===//
1233 // SchedDFSResult Implementation
1234 //===----------------------------------------------------------------------===//
1235 
1236 namespace llvm {
1237 
1238 /// Internal state used to compute SchedDFSResult.
1239 class SchedDFSImpl {
1240   SchedDFSResult &R;
1241 
1242   /// Join DAG nodes into equivalence classes by their subtree.
1243   IntEqClasses SubtreeClasses;
1244   /// List PredSU, SuccSU pairs that represent data edges between subtrees.
1245   std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs;
1246 
1247   struct RootData {
1248     unsigned NodeID;
1249     unsigned ParentNodeID;  ///< Parent node (member of the parent subtree).
1250     unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
1251                                 /// children.
1252 
1253     RootData(unsigned id): NodeID(id),
1254                            ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}
1255 
1256     unsigned getSparseSetIndex() const { return NodeID; }
1257   };
1258 
1259   SparseSet<RootData> RootSet;
1260 
1261 public:
1262   SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
1263     RootSet.setUniverse(R.DFSNodeData.size());
1264   }
1265 
1266   /// Returns true if this node has been visited by the DFS traversal.
1267   ///
1268   /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
1269   /// ID. Later, SubtreeID is updated but remains valid.
1270   bool isVisited(const SUnit *SU) const {
1271     return R.DFSNodeData[SU->NodeNum].SubtreeID
1272       != SchedDFSResult::InvalidSubtreeID;
1273   }
1274 
1275   /// Initializes this node's instruction count. We don't need to flag the node
1276   /// visited until visitPostorder because the DAG cannot have cycles.
1277   void visitPreorder(const SUnit *SU) {
1278     R.DFSNodeData[SU->NodeNum].InstrCount =
1279       SU->getInstr()->isTransient() ? 0 : 1;
1280   }
1281 
1282   /// Called once for each node after all predecessors are visited. Revisit this
1283   /// node's predecessors and potentially join them now that we know the ILP of
1284   /// the other predecessors.
1285   void visitPostorderNode(const SUnit *SU) {
1286     // Mark this node as the root of a subtree. It may be joined with its
1287     // successors later.
1288     R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
1289     RootData RData(SU->NodeNum);
1290     RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;
1291 
1292     // If any predecessors are still in their own subtree, they either cannot be
1293     // joined or are large enough to remain separate. If this parent node's
1294     // total instruction count is not greater than a child subtree by at least
1295     // the subtree limit, then try to join it now since splitting subtrees is
1296     // only useful if multiple high-pressure paths are possible.
1297     unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
1298     for (const SDep &PredDep : SU->Preds) {
1299       if (PredDep.getKind() != SDep::Data)
1300         continue;
1301       unsigned PredNum = PredDep.getSUnit()->NodeNum;
1302       if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
1303         joinPredSubtree(PredDep, SU, /*CheckLimit=*/false);
1304 
1305       // Either link or merge the TreeData entry from the child to the parent.
1306       if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
1307         // If the predecessor's parent is invalid, this is a tree edge and the
1308         // current node is the parent.
1309         if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
1310           RootSet[PredNum].ParentNodeID = SU->NodeNum;
1311       }
1312       else if (RootSet.count(PredNum)) {
1313         // The predecessor is not a root, but is still in the root set. This
1314         // must be the new parent that it was just joined to. Note that
1315         // RootSet[PredNum].ParentNodeID may either be invalid or may still be
1316         // set to the original parent.
1317         RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
1318         RootSet.erase(PredNum);
1319       }
1320     }
1321     RootSet[SU->NodeNum] = RData;
1322   }
1323 
1324   /// Called once for each tree edge after calling visitPostOrderNode on
1325   /// the predecessor. Increment the parent node's instruction count and
1326   /// preemptively join this subtree to its parent's if it is small enough.
1327   void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
1328     R.DFSNodeData[Succ->NodeNum].InstrCount
1329       += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
1330     joinPredSubtree(PredDep, Succ);
1331   }
1332 
1333   /// Adds a connection for cross edges.
1334   void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
1335     ConnectionPairs.emplace_back(PredDep.getSUnit(), Succ);
1336   }
1337 
1338   /// Sets each node's subtree ID to the representative ID and records
1339   /// connections between trees.
1340   void finalize() {
1341     SubtreeClasses.compress();
1342     R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
1343     assert(SubtreeClasses.getNumClasses() == RootSet.size()
1344            && "number of roots should match trees");
1345     for (const RootData &Root : RootSet) {
1346       unsigned TreeID = SubtreeClasses[Root.NodeID];
1347       if (Root.ParentNodeID != SchedDFSResult::InvalidSubtreeID)
1348         R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[Root.ParentNodeID];
1349       R.DFSTreeData[TreeID].SubInstrCount = Root.SubInstrCount;
1350       // Note that SubInstrCount may be greater than InstrCount if we joined
1351       // subtrees across a cross edge. InstrCount will be attributed to the
1352       // original parent, while SubInstrCount will be attributed to the joined
1353       // parent.
1354     }
1355     R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
1356     R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
1357     LLVM_DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
1358     for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
1359       R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
1360       LLVM_DEBUG(dbgs() << "  SU(" << Idx << ") in tree "
1361                         << R.DFSNodeData[Idx].SubtreeID << '\n');
1362     }
1363     for (const auto &[Pred, Succ] : ConnectionPairs) {
1364       unsigned PredTree = SubtreeClasses[Pred->NodeNum];
1365       unsigned SuccTree = SubtreeClasses[Succ->NodeNum];
1366       if (PredTree == SuccTree)
1367         continue;
1368       unsigned Depth = Pred->getDepth();
1369       addConnection(PredTree, SuccTree, Depth);
1370       addConnection(SuccTree, PredTree, Depth);
1371     }
1372   }
1373 
1374 protected:
1375   /// Joins the predecessor subtree with the successor that is its DFS parent.
1376   /// Applies some heuristics before joining.
1377   bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
1378                        bool CheckLimit = true) {
1379     assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");
1380 
1381     // Check if the predecessor is already joined.
1382     const SUnit *PredSU = PredDep.getSUnit();
1383     unsigned PredNum = PredSU->NodeNum;
1384     if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
1385       return false;
1386 
1387     // Four is the magic number of successors before a node is considered a
1388     // pinch point.
1389     unsigned NumDataSucs = 0;
1390     for (const SDep &SuccDep : PredSU->Succs) {
1391       if (SuccDep.getKind() == SDep::Data) {
1392         if (++NumDataSucs >= 4)
1393           return false;
1394       }
1395     }
1396     if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
1397       return false;
1398     R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
1399     SubtreeClasses.join(Succ->NodeNum, PredNum);
1400     return true;
1401   }
1402 
1403   /// Called by finalize() to record a connection between trees.
1404   void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
1405     if (!Depth)
1406       return;
1407 
1408     do {
1409       SmallVectorImpl<SchedDFSResult::Connection> &Connections =
1410         R.SubtreeConnections[FromTree];
1411       for (SchedDFSResult::Connection &C : Connections) {
1412         if (C.TreeID == ToTree) {
1413           C.Level = std::max(C.Level, Depth);
1414           return;
1415         }
1416       }
1417       Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
1418       FromTree = R.DFSTreeData[FromTree].ParentTreeID;
1419     } while (FromTree != SchedDFSResult::InvalidSubtreeID);
1420   }
1421 };
1422 
1423 } // end namespace llvm
1424 
1425 namespace {
1426 
1427 /// Manage the stack used by a reverse depth-first search over the DAG.
1428 class SchedDAGReverseDFS {
1429   std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;
1430 
1431 public:
1432   bool isComplete() const { return DFSStack.empty(); }
1433 
1434   void follow(const SUnit *SU) {
1435     DFSStack.emplace_back(SU, SU->Preds.begin());
1436   }
1437   void advance() { ++DFSStack.back().second; }
1438 
1439   const SDep *backtrack() {
1440     DFSStack.pop_back();
1441     return DFSStack.empty() ? nullptr : std::prev(DFSStack.back().second);
1442   }
1443 
1444   const SUnit *getCurr() const { return DFSStack.back().first; }
1445 
1446   SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }
1447 
1448   SUnit::const_pred_iterator getPredEnd() const {
1449     return getCurr()->Preds.end();
1450   }
1451 };
1452 
1453 } // end anonymous namespace
1454 
1455 static bool hasDataSucc(const SUnit *SU) {
1456   for (const SDep &SuccDep : SU->Succs) {
1457     if (SuccDep.getKind() == SDep::Data &&
1458         !SuccDep.getSUnit()->isBoundaryNode())
1459       return true;
1460   }
1461   return false;
1462 }
1463 
1464 /// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
1465 /// search from this root.
1466 void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
1467   if (!IsBottomUp)
1468     llvm_unreachable("Top-down ILP metric is unimplemented");
1469 
1470   SchedDFSImpl Impl(*this);
1471   for (const SUnit &SU : SUnits) {
1472     if (Impl.isVisited(&SU) || hasDataSucc(&SU))
1473       continue;
1474 
1475     SchedDAGReverseDFS DFS;
1476     Impl.visitPreorder(&SU);
1477     DFS.follow(&SU);
1478     while (true) {
1479       // Traverse the leftmost path as far as possible.
1480       while (DFS.getPred() != DFS.getPredEnd()) {
1481         const SDep &PredDep = *DFS.getPred();
1482         DFS.advance();
1483         // Ignore non-data edges.
1484         if (PredDep.getKind() != SDep::Data
1485             || PredDep.getSUnit()->isBoundaryNode()) {
1486           continue;
1487         }
1488         // An already visited edge is a cross edge, assuming an acyclic DAG.
1489         if (Impl.isVisited(PredDep.getSUnit())) {
1490           Impl.visitCrossEdge(PredDep, DFS.getCurr());
1491           continue;
1492         }
1493         Impl.visitPreorder(PredDep.getSUnit());
1494         DFS.follow(PredDep.getSUnit());
1495       }
1496       // Visit the top of the stack in postorder and backtrack.
1497       const SUnit *Child = DFS.getCurr();
1498       const SDep *PredDep = DFS.backtrack();
1499       Impl.visitPostorderNode(Child);
1500       if (PredDep)
1501         Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
1502       if (DFS.isComplete())
1503         break;
1504     }
1505   }
1506   Impl.finalize();
1507 }
1508 
1509 /// The root of the given SubtreeID was just scheduled. For all subtrees
1510 /// connected to this tree, record the depth of the connection so that the
1511 /// nearest connected subtrees can be prioritized.
1512 void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
1513   for (const Connection &C : SubtreeConnections[SubtreeID]) {
1514     SubtreeConnectLevels[C.TreeID] =
1515       std::max(SubtreeConnectLevels[C.TreeID], C.Level);
1516     LLVM_DEBUG(dbgs() << "  Tree: " << C.TreeID << " @"
1517                       << SubtreeConnectLevels[C.TreeID] << '\n');
1518   }
1519 }
1520 
1521 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1522 LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
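  // E.g. 8 instructions over a critical path of length 4 print as "8 / 4 = 2";
  // a Length of zero prints "BADILP" in place of the ratio.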
1523   OS << InstrCount << " / " << Length << " = ";
1524   if (!Length)
1525     OS << "BADILP";
1526   else
1527     OS << format("%g", ((double)InstrCount / Length));
1528 }
1529 
1530 LLVM_DUMP_METHOD void ILPValue::dump() const {
1531   dbgs() << *this << '\n';
1532 }
1533 
1534 namespace llvm {
1535 
1536 LLVM_ATTRIBUTE_UNUSED
1537 raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
1538   Val.print(OS);
1539   return OS;
1540 }
1541 
1542 } // end namespace llvm
1543 
1544 #endif
1545