xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (revision 9861a68a7cc5825fc9e680c76dfe71cda4a3c0b9)
1 //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP  ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file This file defines a set of schedule DAG mutations that can be used to
10 // override default scheduler behavior to enforce specific scheduling patterns.
11 // They should be used in cases where runtime performance considerations, such
12 // as inter-wavefront interactions, mean that compile-time heuristics cannot
13 // predict the optimal instruction ordering, or in kernels where optimal
14 // instruction scheduling is important enough to warrant manual intervention.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "AMDGPUIGroupLP.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/BitmaskEnum.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "igrouplp"
31 
32 namespace {
33 
34 static cl::opt<bool> EnableExactSolver(
35     "amdgpu-igrouplp-exact-solver", cl::Hidden,
36     cl::desc("Whether to use the exponential time solver to fit "
37              "the instructions to the pipeline as closely as "
38              "possible."),
39     cl::init(false));
40 
41 static cl::opt<unsigned> CutoffForExact(
42     "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43     cl::desc("The maximum number of scheduling group conflicts "
44              "which we attempt to solve with the exponential time "
45              "exact solver. Problem sizes greater than this will"
46              "be solved by the less accurate greedy algorithm. Selecting "
47              "solver by size is superseded by manually selecting "
48              "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
49 
50 static cl::opt<uint64_t> MaxBranchesExplored(
51     "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52     cl::desc("The amount of branches that we are willing to explore with"
53              "the exact algorithm before giving up."));
54 
55 static cl::opt<bool> UseCostHeur(
56     "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57     cl::desc("Whether to use the cost heuristic to make choices as we "
58              "traverse the search space using the exact solver. Defaulted "
59              "to on, and if turned off, we will use the node order -- "
60              "attempting to put the later nodes in the later sched groups. "
61              "Experimentally, results are mixed, so this should be set on a "
62              "case-by-case basis."));
63 
64 // Components of the mask that determines which instruction types may be
65 // classified into a SchedGroup.
66 enum class SchedGroupMask {
67   NONE = 0u,
68   ALU = 1u << 0,
69   VALU = 1u << 1,
70   SALU = 1u << 2,
71   MFMA = 1u << 3,
72   VMEM = 1u << 4,
73   VMEM_READ = 1u << 5,
74   VMEM_WRITE = 1u << 6,
75   DS = 1u << 7,
76   DS_READ = 1u << 8,
77   DS_WRITE = 1u << 9,
78   ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
79         DS_READ | DS_WRITE,
80   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
81 };
82 
83 typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
84 
85 // Classify instructions into groups to enable fine-tuned control over the
86 // scheduler. These groups may be more specific than current SchedModel
87 // instruction classes.
88 class SchedGroup {
89 private:
90   // Mask that defines which instruction types can be classified into this
91   // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
92   // and SCHED_GROUP_BARRIER.
93   SchedGroupMask SGMask;
94 
95   // Maximum number of SUnits that can be added to this group.
96   Optional<unsigned> MaxSize;
97 
98   // SchedGroups will only synchronize with other SchedGroups that have the same
99   // SyncID.
100   int SyncID = 0;
101 
102   // SGID is used to map instructions to candidate SchedGroups
103   unsigned SGID;
104 
105   // Count of the number of created SchedGroups, used to initialize SGID.
106   static unsigned NumSchedGroups;
107 
108   ScheduleDAGInstrs *DAG;
109 
110   const SIInstrInfo *TII;
111 
112   // Try to add an edge from SU A to SU B.
113   bool tryAddEdge(SUnit *A, SUnit *B);
114 
115   // Use SGMask to determine whether we can classify MI as a member of this
116   // SchedGroup object.
117   bool canAddMI(const MachineInstr &MI) const;
118 
119 public:
120   // Collection of SUnits that are classified as members of this group.
121   SmallVector<SUnit *, 32> Collection;
122 
123   // Returns true if SU can be added to this SchedGroup.
124   bool canAddSU(SUnit &SU) const;
125 
126   // Add DAG dependencies between all SUnits in this SchedGroup and this SU. If
127   // MakePred is true, SU will be a predecessor of the SUnits in this
128   // SchedGroup, otherwise SU will be a successor.
129   void link(SUnit &SU, bool MakePred = false);
130 
131   // Add DAG dependencies, and track which edges are added and the count of
132   // missed edges.
133   int link(SUnit &SU, bool MakePred,
134            std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
135 
136   // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
137   // Use the predicate to determine whether SU should be a predecessor (P =
138   // true) or a successor (P = false) of this SchedGroup.
139   void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
140 
141   // Add DAG dependencies such that SUnits in this group shall be ordered
142   // before SUnits in OtherGroup.
143   void link(SchedGroup &OtherGroup);
144 
145   // Returns true if no more instructions may be added to this group.
146   bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
147 
148   // Add SU to the SchedGroup.
149   void add(SUnit &SU) {
150     LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
151                       << format_hex((int)SGMask, 10, true) << " adding "
152                       << *SU.getInstr());
153     Collection.push_back(&SU);
154   }
155 
156   // Remove last element in the SchedGroup
157   void pop() { Collection.pop_back(); }
158 
159   // Identify and add all relevant SUs from the DAG to this SchedGroup.
160   void initSchedGroup();
161 
162   // Add instructions to the SchedGroup bottom up starting from RIter.
163   // PipelineInstrs is a set of instructions that should not be added to the
164   // SchedGroup even when the other conditions for adding it are satisfied.
165   // RIter will be added to the SchedGroup as well, and dependencies will be
166   // added so that RIter will always be scheduled at the end of the group.
167   void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
168                       SUnitsToCandidateSGsMap &SyncedInstrs);
169 
170   void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
171 
172   int getSyncID() { return SyncID; }
173 
174   int getSGID() { return SGID; }
175 
176   SchedGroupMask getMask() { return SGMask; }
177 
178   SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize,
179              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
180       : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
181     SGID = NumSchedGroups++;
182   }
183 
184   SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize, int SyncID,
185              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
186       : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
187     SGID = NumSchedGroups++;
188   }
189 };
190 
191 // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
192 static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
193   assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
194          SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
195          SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
196 
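  // removePred() mutates the Preds/Succs lists while they are being iterated,
  // so keep rescanning from the start until no edges remain.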
197   while (!SU.Preds.empty())
198     for (auto &P : SU.Preds)
199       SU.removePred(P);
200 
201   while (!SU.Succs.empty())
202     for (auto &S : SU.Succs)
203       for (auto &SP : S.getSUnit()->Preds)
204         if (SP.getSUnit() == &SU)
205           S.getSUnit()->removePred(SP);
206 }
207 
208 typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
209 typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
210 
211 // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
212 // in non-trivial cases. For example, if the requested pipeline is
213 // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
214 // in the DAG, then we will have an instruction that cannot be trivially
215 // assigned to a SchedGroup. The PipelineSolver class implements two algorithms
216 // to find a good solution to the pipeline -- a greedy algorithm and an exact
217 // algorithm. The exact algorithm has an exponential time complexity and should
218 // only be used for small problems, or for medium-sized problems where an exact
219 // solution is highly desired.
220 class PipelineSolver {
221   ScheduleDAGMI *DAG;
222 
223   // Instructions that can be assigned to multiple SchedGroups
224   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
225   SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
226   DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
227   // The current working pipeline
228   SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
229   // The pipeline that has the best solution found so far
230   SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
231 
232   // Whether or not we actually have any SyncedInstrs to try to solve.
233   bool NeedsSolver = false;
234 
235   // Compute an estimate of the size of the search tree -- the true size is
236   // the product of each conflictedInst.Matches.size() across all SyncPipelines
237   unsigned computeProblemSize();
238 
239   // The cost penalty of not assigning a SU to a SchedGroup
240   int MissPenalty = 0;
241 
242   // Costs in terms of the number of edges we are unable to add
243   int BestCost = -1;
244   int CurrCost = 0;
245 
246   // Index pointing to the conflicting instruction that is currently being
247   // fitted
248   int CurrConflInstNo = 0;
249   // Index to the pipeline that is currently being fitted
250   int CurrSyncGroupIdx = 0;
251   // The first non trivial pipeline
252   int BeginSyncGroupIdx = 0;
253 
254   // How many branches we have explored
255   uint64_t BranchesExplored = 0;
256 
257   // Update indices to fit next conflicting instruction
258   void advancePosition();
259   // Recede indices to attempt to find better fit for previous conflicting
260   // instruction
261   void retreatPosition();
262 
263   // The exponential time algorithm which finds the provably best fit
264   bool solveExact();
265   // The polynomial time algorithm which attempts to find a good fit
266   bool solveGreedy();
267   // Whether or not the current solution is optimal
268   bool checkOptimal();
269   // Populate the ready list, prioritizing fewest missed edges first.
270   void populateReadyList(SUToCandSGsPair &CurrSU,
271                          SmallVectorImpl<std::pair<int, int>> &ReadyList,
272                          SmallVectorImpl<SchedGroup> &SyncPipeline);
273   // Add edges corresponding to the SchedGroups as assigned by solver
274   void makePipeline();
275   // Add the edges from the SU to the other SchedGroups in pipeline, and
276   // return the number of edges missed.
277   int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
278                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
279   // Remove the edges passed via AddedEdges
280   void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
281   // Convert the passed in maps to arrays for bidirectional iterators
282   void convertSyncMapsToArrays();
283 
284   void reset();
285 
286 public:
287   // Invoke the solver to map instructions to instruction groups. Heuristics
288   // and command line options select the exact or the greedy algorithm.
289   void solve();
290 
291   PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
292                  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
293                  ScheduleDAGMI *DAG)
294       : DAG(DAG), SyncedInstrs(SyncedInstrs),
295         SyncedSchedGroups(SyncedSchedGroups) {
296 
297     for (auto &PipelineInstrs : SyncedInstrs) {
298       if (PipelineInstrs.second.size() > 0) {
299         NeedsSolver = true;
300         break;
301       }
302     }
303 
304     if (!NeedsSolver)
305       return;
306 
307     convertSyncMapsToArrays();
308 
309     CurrPipeline = BestPipeline;
310 
311     while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
312            PipelineInstrs[BeginSyncGroupIdx].size() == 0)
313       ++BeginSyncGroupIdx;
314 
315     if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
316       return;
317   }
318 };
319 
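// Clear every assignment made by a previous solver pass, keeping only the
// SCHED_GROUP_BARRIER that anchors each group, and rewind the current cost and
// position back to the starting point.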
320 void PipelineSolver::reset() {
321 
322   for (auto &SyncPipeline : CurrPipeline) {
323     for (auto &SG : SyncPipeline) {
324       SmallVector<SUnit *, 32> TempCollection = SG.Collection;
325       SG.Collection.clear();
326       auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
327         return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
328       });
329       if (SchedBarr != TempCollection.end())
330         SG.Collection.push_back(*SchedBarr);
331     }
332   }
333 
334   CurrSyncGroupIdx = BeginSyncGroupIdx;
335   CurrConflInstNo = 0;
336   CurrCost = 0;
337 }
338 
339 void PipelineSolver::convertSyncMapsToArrays() {
340   for (auto &SyncPipe : SyncedSchedGroups) {
341     BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
342   }
343 
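  // BestPipeline was built above by prepending each sync pipeline, so fill
  // PipelineInstrs from the back to keep index i referring to the same sync
  // pipeline in both containers.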
344   int PipelineIDx = SyncedInstrs.size() - 1;
345   PipelineInstrs.resize(SyncedInstrs.size());
346   for (auto &SyncInstrMap : SyncedInstrs) {
347     for (auto &SUsToCandSGs : SyncInstrMap.second) {
348       if (PipelineInstrs[PipelineIDx].size() == 0) {
349         PipelineInstrs[PipelineIDx].push_back(
350             std::make_pair(SUsToCandSGs.first, SUsToCandSGs.second));
351         continue;
352       }
353       auto SortPosition = PipelineInstrs[PipelineIDx].begin();
354       // Insert them in sorted order -- this allows for good parsing order in
355       // the greedy algorithm
356       while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
357              SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
358         ++SortPosition;
359       PipelineInstrs[PipelineIDx].insert(
360           SortPosition,
361           std::make_pair(SUsToCandSGs.first, SUsToCandSGs.second));
362     }
363     --PipelineIDx;
364   }
365 }
366 
367 void PipelineSolver::makePipeline() {
368   // Preserve the order of the barriers for subsequent SchedGroupBarrier
      // mutations.
369   for (auto &SyncPipeline : BestPipeline) {
370     for (auto &SG : SyncPipeline) {
371       SUnit *SGBarr = nullptr;
372       for (auto &SU : SG.Collection) {
373         if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
374           SGBarr = SU;
375       }
376       // Command-line-requested IGroupLP pipelines don't have an SGBarr.
377       if (!SGBarr)
378         continue;
379       resetEdges(*SGBarr, DAG);
380       SG.link(*SGBarr, false);
381     }
382   }
383 
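  // Within each sync pipeline, add ordering edges between every pair of groups.
  // The vector stores the groups in reverse pipeline order (see addEdges), so
  // iterate from the back and link each group ahead of the groups that follow
  // it in the pipeline.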
384   for (auto &SyncPipeline : BestPipeline) {
385     auto I = SyncPipeline.rbegin();
386     auto E = SyncPipeline.rend();
387     for (; I != E; ++I) {
388       auto &GroupA = *I;
389       for (auto J = std::next(I); J != E; ++J) {
390         auto &GroupB = *J;
391         GroupA.link(GroupB);
392       }
393     }
394   }
395 }
396 
397 int PipelineSolver::addEdges(
398     SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
399     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
400   int AddedCost = 0;
401   bool MakePred = false;
402 
403   // The groups in the pipeline are in reverse order. Thus,
404   // by traversing them from last to first, we are traversing
405   // them in the order in which they were introduced in the code. After we
406   // pass the group the SU is being assigned to, it should be
407   // linked as a predecessor of the subsequent SchedGroups
408   auto GroupNo = (int)SyncPipeline.size() - 1;
409   for (; GroupNo >= 0; GroupNo--) {
410     if (SyncPipeline[GroupNo].getSGID() == SGID) {
411       MakePred = true;
412       continue;
413     }
414     auto Group = &SyncPipeline[GroupNo];
415     AddedCost += Group->link(*SU, MakePred, AddedEdges);
416     assert(AddedCost >= 0);
417   }
418 
419   return AddedCost;
420 }
421 
422 void PipelineSolver::removeEdges(
423     const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
424   // Only remove the edges that we have added when testing
425   // the fit.
426   for (auto &PredSuccPair : EdgesToRemove) {
427     SUnit *Pred = PredSuccPair.first;
428     SUnit *Succ = PredSuccPair.second;
429 
430     auto Match = llvm::find_if(
431         Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
432     if (Match != Succ->Preds.end()) {
433       assert(Match->isArtificial());
434       Succ->removePred(*Match);
435     }
436   }
437 }
438 
439 void PipelineSolver::advancePosition() {
440   ++CurrConflInstNo;
441 
442   if (static_cast<size_t>(CurrConflInstNo) >=
443       PipelineInstrs[CurrSyncGroupIdx].size()) {
444     CurrConflInstNo = 0;
445     ++CurrSyncGroupIdx;
446     // Advance to next non-trivial pipeline
447     while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
448            PipelineInstrs[CurrSyncGroupIdx].size() == 0)
449       ++CurrSyncGroupIdx;
450   }
451 }
452 
453 void PipelineSolver::retreatPosition() {
454   assert(CurrConflInstNo >= 0);
455   assert(CurrSyncGroupIdx >= 0);
456 
457   if (CurrConflInstNo > 0) {
458     --CurrConflInstNo;
459     return;
460   }
461 
462   if (CurrConflInstNo == 0) {
463     // If we return to the starting position, we have explored
464     // the entire tree
465     if (CurrSyncGroupIdx == BeginSyncGroupIdx)
466       return;
467 
468     --CurrSyncGroupIdx;
469     // Go to previous non-trivial pipeline
470     while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
471       --CurrSyncGroupIdx;
472 
473     CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
474   }
475 }
476 
477 bool PipelineSolver::checkOptimal() {
478   if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
479     if (BestCost == -1 || CurrCost < BestCost) {
480       BestPipeline = CurrPipeline;
481       BestCost = CurrCost;
482       LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
483     }
484     assert(BestCost >= 0);
485   }
486 
487   bool DoneExploring = false;
488   if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
489     DoneExploring = true;
490 
491   return (DoneExploring || BestCost == 0);
492 }
493 
494 void PipelineSolver::populateReadyList(
495     SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
496     SmallVectorImpl<SchedGroup> &SyncPipeline) {
497   assert(CurrSU.second.size() >= 1);
498   auto I = CurrSU.second.rbegin();
499   auto E = CurrSU.second.rend();
500   for (; I != E; ++I) {
501     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
502     int CandSGID = *I;
503     SchedGroup *Match;
504     for (auto &SG : SyncPipeline) {
505       if (SG.getSGID() == CandSGID)
506         Match = &SG;
507     }
508 
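    // With the cost heuristic, score each candidate group by the number of
    // edges that would be missed; without it, keep the node-order traversal and
    // give every candidate the sentinel cost -1 so the list is left unsorted.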
509     if (UseCostHeur) {
510       if (Match->isFull()) {
511         ReadyList.push_back(std::make_pair(*I, MissPenalty));
512         continue;
513       }
514 
515       int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
516       ReadyList.push_back(std::make_pair(*I, TempCost));
517       removeEdges(AddedEdges);
518     } else
519       ReadyList.push_back(std::make_pair(*I, -1));
520   }
521 
522   if (UseCostHeur) {
523     std::sort(ReadyList.begin(), ReadyList.end(),
524               [](std::pair<int, int> A, std::pair<int, int> B) {
525                 return A.second < B.second;
526               });
527   }
528 
529   assert(ReadyList.size() == CurrSU.second.size());
530 }
531 
532 bool PipelineSolver::solveExact() {
533   if (checkOptimal())
534     return true;
535 
536   if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
537     return false;
538 
539   assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
540   assert(static_cast<size_t>(CurrConflInstNo) <
541          PipelineInstrs[CurrSyncGroupIdx].size());
542   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
543   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
544                     << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
545 
546   // SchedGroup -> Cost pairs
547   SmallVector<std::pair<int, int>, 4> ReadyList;
548   // Prioritize the candidate sched groups in terms of lowest cost first
549   populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
550 
551   auto I = ReadyList.begin();
552   auto E = ReadyList.end();
553   for (; I != E; ++I) {
554     // If we are trying SGs in least cost order, and the current SG is cost
555     // infeasible, then all subsequent SGs will also be cost infeasible, so we
556     // can prune.
557     if (BestCost != -1 && (CurrCost + I->second > BestCost))
558       return false;
559 
560     int CandSGID = I->first;
561     int AddedCost = 0;
562     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
563     auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
564     SchedGroup *Match;
565     for (auto &SG : SyncPipeline) {
566       if (SG.getSGID() == CandSGID)
567         Match = &SG;
568     }
569 
570     if (Match->isFull())
571       continue;
572 
573     LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
574                       << (int)Match->getMask() << "and ID " << CandSGID
575                       << "\n");
576     Match->add(*CurrSU.first);
577     AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
578     LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
579     CurrCost += AddedCost;
580     advancePosition();
581     ++BranchesExplored;
582     bool FinishedExploring = false;
583     // Only recurse if the cost after adding edges can still beat the best
584     // known solution; otherwise backtrack.
585     if (CurrCost < BestCost || BestCost == -1) {
586       if (solveExact()) {
587         FinishedExploring = BestCost != 0;
588         if (!FinishedExploring)
589           return true;
590       }
591     }
592 
593     retreatPosition();
594     CurrCost -= AddedCost;
595     removeEdges(AddedEdges);
596     Match->pop();
597     CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
598     if (FinishedExploring)
599       return true;
600   }
601 
602   // Try the pipeline where the current instruction is omitted
603   // Potentially if we omit a problematic instruction from the pipeline,
604   // all the other instructions can nicely fit.
605   CurrCost += MissPenalty;
606   advancePosition();
607 
608   LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
609 
610   bool FinishedExploring = false;
611   if (CurrCost < BestCost || BestCost == -1) {
612     if (solveExact()) {
613       FinishedExploring = BestCost != 0;
614       if (!FinishedExploring)
615         return true;
616     }
617   }
618 
619   retreatPosition();
620   CurrCost -= MissPenalty;
621   return FinishedExploring;
622 }
623 
624 bool PipelineSolver::solveGreedy() {
625   BestCost = 0;
626   std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
627 
628   while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
629     SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
630     int BestNodeCost = -1;
631     int TempCost;
632     SchedGroup *BestGroup = nullptr;
633     int BestGroupID = -1;
634     auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
635     LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
636                       << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
637 
638     // Since we have added the potential SchedGroups from bottom up, but
639     // traversed the DAG from top down, parse over the groups from last to
640     // first. If we fail to do this for the greedy algorithm, the solution will
641     // likely not be good in more complex cases.
642     auto I = CurrSU.second.rbegin();
643     auto E = CurrSU.second.rend();
644     for (; I != E; ++I) {
645       std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
646       int CandSGID = *I;
647       SchedGroup *Match;
648       for (auto &SG : SyncPipeline) {
649         if (SG.getSGID() == CandSGID)
650           Match = &SG;
651       }
652 
653       LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
654                         << (int)Match->getMask() << "\n");
655 
656       if (Match->isFull()) {
657         LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
658         continue;
659       }
660       TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
661       LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
662       if (TempCost < BestNodeCost || BestNodeCost == -1) {
663         BestGroup = Match;
664         BestNodeCost = TempCost;
665         BestGroupID = CandSGID;
666       }
667       removeEdges(AddedEdges);
668       if (BestNodeCost == 0)
669         break;
670     }
671 
672     if (BestGroupID != -1) {
673       BestGroup->add(*CurrSU.first);
674       addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
675       LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
676                         << (int)BestGroup->getMask() << "\n");
677       BestCost += BestNodeCost;
678     } else
679       BestCost += MissPenalty;
680 
681     CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
682     advancePosition();
683   }
684   BestPipeline = CurrPipeline;
685   removeEdges(AddedEdges);
686   return false;
687 }
688 
689 unsigned PipelineSolver::computeProblemSize() {
690   unsigned ProblemSize = 0;
691   for (auto &PipeConflicts : PipelineInstrs) {
692     ProblemSize += PipeConflicts.size();
693   }
694 
695   return ProblemSize;
696 }
697 
698 void PipelineSolver::solve() {
699   if (!NeedsSolver)
700     return;
701 
702   unsigned ProblemSize = computeProblemSize();
703   assert(ProblemSize > 0);
704 
705   bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
706   MissPenalty = (ProblemSize / 2) + 1;
707 
708   LLVM_DEBUG(DAG->dump());
709   if (EnableExactSolver || BelowCutoff) {
710     LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
711     solveGreedy();
712     reset();
713     LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
714     if (BestCost > 0) {
715       LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
716       solveExact();
717       LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
718     }
719   } else { // Use the Greedy Algorithm by default
720     LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
721     solveGreedy();
722   }
723 
724   makePipeline();
725 }
726 
727 enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };
728 
729 // Implement an IGLP scheduling strategy.
730 class IGLPStrategy {
731 protected:
732   ScheduleDAGInstrs *DAG;
733 
734   const SIInstrInfo *TII;
735 
736 public:
737   // Add SchedGroups to \p Pipeline to implement this Strategy.
738   virtual void applyIGLPStrategy(
739       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
740       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
741 
742   // Returns true if this strategy should be applied to a ScheduleDAG.
743   virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
744 
745   IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
746       : DAG(DAG), TII(TII) {}
747 
748   virtual ~IGLPStrategy() = default;
749 };
750 
751 class MFMASmallGemmOpt final : public IGLPStrategy {
752 public:
753   void applyIGLPStrategy(
754       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
755       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
756 
757   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
758 
759   MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
760       : IGLPStrategy(DAG, TII) {}
761 };
762 
763 void MFMASmallGemmOpt::applyIGLPStrategy(
764     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
765     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
766   // Count the number of MFMA instructions.
767   unsigned MFMACount = 0;
768   for (auto I = DAG->begin(), E = DAG->end(); I != E; ++I) {
769     if (TII->isMFMA(*I))
770       ++MFMACount;
771   }
772 
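  // Build the requested pipeline out of single-instruction SchedGroups, all on
  // sync ID 0: one DS_READ, VMEM_READ, MFMA, VMEM_WRITE and DS_WRITE group per
  // MFMA, followed by a second round of DS_READ, VMEM_READ, VMEM_WRITE and
  // DS_WRITE groups.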
773   const unsigned PipelineSyncID = 0;
774   SchedGroup *SG = nullptr;
775   for (unsigned I = 0; I < MFMACount; ++I) {
776     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
777         SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
778     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
779 
780     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
781         SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
782     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
783 
784     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
785         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
786     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
787 
788     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
789         SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
790     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
791 
792     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
793         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
794     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
795   }
796 
797   for (unsigned I = 0; I < MFMACount; ++I) {
798     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
799         SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
800     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
801 
802     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
803         SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
804     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
805 
806     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
807         SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
808     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
809 
810     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
811         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
812     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
813   }
814 }
815 
816 static std::unique_ptr<IGLPStrategy>
817 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
818                    const SIInstrInfo *TII) {
819   switch (ID) {
820   case MFMASmallGemmOptID:
821     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
822   }
823 
824   llvm_unreachable("Unknown IGLPStrategyID");
825 }
826 
827 class IGroupLPDAGMutation : public ScheduleDAGMutation {
828 private:
829   const SIInstrInfo *TII;
830 
831   ScheduleDAGMI *DAG;
832 
833   // Organize lists of SchedGroups by their SyncID. SchedGroups /
834   // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
835   // between them.
836   DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
837 
838   // Used to track instructions that can be mapped to multiple sched groups
839   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
840 
841   // Add DAG edges that enforce SCHED_BARRIER ordering.
842   void addSchedBarrierEdges(SUnit &SU);
843 
844   // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
845   // not be reordered across the SCHED_BARRIER. This is used for the base
846   // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
847   // SCHED_BARRIER will always block all instructions that can be classified
848   // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
849   // and may only synchronize with some SchedGroups. Returns the inverse of
850   // Mask. SCHED_BARRIER's mask describes which instruction types should be
851   // allowed to be scheduled across it. Invert the mask to get the
852   // SchedGroupMask of instructions that should be barred.
853   SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
854 
855   // Create SchedGroups for a SCHED_GROUP_BARRIER.
856   void initSchedGroupBarrierPipelineStage(
857       std::vector<SUnit>::reverse_iterator RIter);
858 
859   void initIGLPOpt(SUnit &SU);
860 
861 public:
862   void apply(ScheduleDAGInstrs *DAGInstrs) override;
863 
864   IGroupLPDAGMutation() = default;
865 };
866 
867 unsigned SchedGroup::NumSchedGroups = 0;
868 
869 bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
870   if (A != B && DAG->canAddEdge(B, A)) {
871     DAG->addEdge(B, SDep(A, SDep::Artificial));
872     return true;
873   }
874   return false;
875 }
876 
877 bool SchedGroup::canAddMI(const MachineInstr &MI) const {
878   bool Result = false;
879   if (MI.isMetaInstruction())
880     Result = false;
881 
882   else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
883            (TII->isVALU(MI) || TII->isMFMA(MI) || TII->isSALU(MI)))
884     Result = true;
885 
886   else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
887            TII->isVALU(MI) && !TII->isMFMA(MI))
888     Result = true;
889 
890   else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
891            TII->isSALU(MI))
892     Result = true;
893 
894   else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
895            TII->isMFMA(MI))
896     Result = true;
897 
898   else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
899            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
900     Result = true;
901 
902   else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
903            MI.mayLoad() &&
904            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
905     Result = true;
906 
907   else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
908            MI.mayStore() &&
909            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
910     Result = true;
911 
912   else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
913            TII->isDS(MI))
914     Result = true;
915 
916   else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
917            MI.mayLoad() && TII->isDS(MI))
918     Result = true;
919 
920   else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
921            MI.mayStore() && TII->isDS(MI))
922     Result = true;
923 
924   LLVM_DEBUG(
925       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
926              << (Result ? " could classify " : " unable to classify ") << MI);
927 
928   return Result;
929 }
930 
931 int SchedGroup::link(SUnit &SU, bool MakePred,
932                      std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
933   int MissedEdges = 0;
934   for (auto *A : Collection) {
935     SUnit *B = &SU;
936     if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
937       continue;
938     if (MakePred)
939       std::swap(A, B);
940 
941     if (DAG->IsReachable(B, A))
942       continue;
943     // tryAddEdge returns false if there is a dependency that makes adding
944     // the A->B edge impossible, otherwise it returns true;
945     bool Added = tryAddEdge(A, B);
946     if (Added)
947       AddedEdges.push_back(std::make_pair(A, B));
948     else
949       ++MissedEdges;
950   }
951 
952   return MissedEdges;
953 }
954 
955 void SchedGroup::link(SUnit &SU, bool MakePred) {
956   for (auto *A : Collection) {
957     SUnit *B = &SU;
958     if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
959       continue;
960     if (MakePred)
961       std::swap(A, B);
962 
963     tryAddEdge(A, B);
964   }
965 }
966 
967 void SchedGroup::link(SUnit &SU,
968                       function_ref<bool(const SUnit *A, const SUnit *B)> P) {
969   for (auto *A : Collection) {
970     SUnit *B = &SU;
971     if (P(A, B))
972       std::swap(A, B);
973 
974     tryAddEdge(A, B);
975   }
976 }
977 
978 void SchedGroup::link(SchedGroup &OtherGroup) {
979   for (auto *B : OtherGroup.Collection)
980     link(*B);
981 }
982 
983 bool SchedGroup::canAddSU(SUnit &SU) const {
984   MachineInstr &MI = *SU.getInstr();
985   if (MI.getOpcode() != TargetOpcode::BUNDLE)
986     return canAddMI(MI);
987 
988   // Special case for bundled MIs.
989   const MachineBasicBlock *MBB = MI.getParent();
990   MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
991   while (E != MBB->end() && E->isBundledWithPred())
992     ++E;
993 
994   // Return true if all of the bundled MIs can be added to this group.
995   return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
996 }
997 
998 void SchedGroup::initSchedGroup() {
999   for (auto &SU : DAG->SUnits) {
1000     if (isFull())
1001       break;
1002 
1003     if (canAddSU(SU))
1004       add(SU);
1005   }
1006 }
1007 
1008 void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
1009                                 SUnitsToCandidateSGsMap &SyncedInstrs) {
1010   SUnit &InitSU = *RIter;
1011   for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
1012     auto &SU = *RIter;
1013     if (isFull())
1014       break;
1015 
1016     if (canAddSU(SU))
1017       SyncedInstrs[&SU].push_back(SGID);
1018   }
1019 
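  // InitSU (the SCHED_GROUP_BARRIER) occupies a slot in the group itself, so
  // bump MaxSize to leave room for the requested number of instructions.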
1020   add(InitSU);
1021   assert(MaxSize);
1022   (*MaxSize)++;
1023 }
1024 
1025 void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
1026   auto I = DAG->SUnits.rbegin();
1027   auto E = DAG->SUnits.rend();
1028   for (; I != E; ++I) {
1029     auto &SU = *I;
1030     if (isFull())
1031       break;
1032 
1033     if (canAddSU(SU))
1034       SyncedInstrs[&SU].push_back(SGID);
1035   }
1036 }
1037 
1038 void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
1039   const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1040   if (!TSchedModel || DAGInstrs->SUnits.empty())
1041     return;
1042 
1043   LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
1044   const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1045   TII = ST.getInstrInfo();
1046   DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
1047   SyncedSchedGroups.clear();
1048   SyncedInstrs.clear();
1049   bool foundSB = false;
1050   bool foundIGLP = false;
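  // Walk the region bottom-up: each SCHED_GROUP_BARRIER gathers candidate
  // instructions from the region above it, and the reverse iterator handed to
  // initSchedGroupBarrierPipelineStage continues upward from the barrier to
  // collect them.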
1051   for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
1052     unsigned Opc = R->getInstr()->getOpcode();
1053     // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
1054     if (Opc == AMDGPU::SCHED_BARRIER) {
1055       addSchedBarrierEdges(*R);
1056       foundSB = true;
1057     } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
1058       initSchedGroupBarrierPipelineStage(R);
1059       foundSB = true;
1060     } else if (Opc == AMDGPU::IGLP_OPT) {
1061       resetEdges(*R, DAG);
1062       if (!foundSB && !foundIGLP)
1063         initIGLPOpt(*R);
1064       foundIGLP = true;
1065     }
1066   }
1067 
1068   if (foundSB || foundIGLP) {
1069     PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
1070     // PipelineSolver performs the mutation by adding the edges it
1071     // determined to be the best.
1072     PS.solve();
1073   }
1074 }
1075 
1076 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
1077   MachineInstr &MI = *SchedBarrier.getInstr();
1078   assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
1079   // Remove all existing edges from the SCHED_BARRIER that were added due to the
1080   // instruction having side effects.
1081   resetEdges(SchedBarrier, DAG);
1082   auto InvertedMask =
1083       invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
1084   SchedGroup SG(InvertedMask, None, DAG, TII);
1085   SG.initSchedGroup();
1086   // Preserve original instruction ordering relative to the SCHED_BARRIER.
1087   SG.link(
1088       SchedBarrier,
1089       (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
1090           const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
1091 }
1092 
1093 SchedGroupMask
1094 IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
1095   // Invert mask and erase bits for types of instructions that are implied to be
1096   // allowed past the SCHED_BARRIER.
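  // For example, if the mask only allows VMEM_READ to cross the barrier, the
  // inverted mask must not bar VMEM_READ, and VMEM must be dropped from it as
  // well, since a VMEM group would also capture VMEM_READ instructions.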
1097   SchedGroupMask InvertedMask = ~Mask;
1098 
1099   // ALU implies VALU, SALU, MFMA.
1100   if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
1101     InvertedMask &=
1102         ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
1103   // VALU, SALU, MFMA implies ALU.
1104   else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
1105            (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
1106            (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
1107     InvertedMask &= ~SchedGroupMask::ALU;
1108 
1109   // VMEM implies VMEM_READ, VMEM_WRITE.
1110   if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
1111     InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
1112   // VMEM_READ, VMEM_WRITE implies VMEM.
1113   else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
1114            (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
1115     InvertedMask &= ~SchedGroupMask::VMEM;
1116 
1117   // DS implies DS_READ, DS_WRITE.
1118   if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
1119     InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
1120   // DS_READ, DS_WRITE implies DS.
1121   else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
1122            (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
1123     InvertedMask &= ~SchedGroupMask::DS;
1124 
1125   return InvertedMask;
1126 }
1127 
1128 void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
1129     std::vector<SUnit>::reverse_iterator RIter) {
1130   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
1131   // to the instruction having side effects.
1132   resetEdges(*RIter, DAG);
1133   MachineInstr &SGB = *RIter->getInstr();
1134   assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
1135   int32_t SGMask = SGB.getOperand(0).getImm();
1136   int32_t Size = SGB.getOperand(1).getImm();
1137   int32_t SyncID = SGB.getOperand(2).getImm();
1138 
1139   auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
1140                                                     Size, SyncID, DAG, TII);
1141 
1142   SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
1143 }
1144 
1145 void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
1146   IGLPStrategyID StrategyID =
1147       (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
1148   auto S = createIGLPStrategy(StrategyID, DAG, TII);
1149   if (S->shouldApplyStrategy(DAG))
1150     S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
1151 }
1152 
1153 } // namespace
1154 
1155 namespace llvm {
1156 
1157 std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
1158   return std::make_unique<IGroupLPDAGMutation>();
1159 }
1160 
1161 } // end namespace llvm
1162