xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (revision cb14a3fe5122c879eae1fb480ed7ce82a699ddb6)
181ad6265SDimitry Andric //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP  ------------===//
281ad6265SDimitry Andric //
381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
681ad6265SDimitry Andric //
781ad6265SDimitry Andric //===----------------------------------------------------------------------===//
881ad6265SDimitry Andric //
981ad6265SDimitry Andric // \file This file defines a set of schedule DAG mutations that can be used to
1081ad6265SDimitry Andric // override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
1381ad6265SDimitry Andric // predict the optimal instruction ordering, or in kernels where optimum
1481ad6265SDimitry Andric // instruction scheduling is important enough to warrant manual intervention.
1581ad6265SDimitry Andric //
1681ad6265SDimitry Andric //===----------------------------------------------------------------------===//
1781ad6265SDimitry Andric 
1881ad6265SDimitry Andric #include "AMDGPUIGroupLP.h"
1981ad6265SDimitry Andric #include "AMDGPUTargetMachine.h"
2081ad6265SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
2181ad6265SDimitry Andric #include "SIInstrInfo.h"
2281ad6265SDimitry Andric #include "SIMachineFunctionInfo.h"
2381ad6265SDimitry Andric #include "llvm/ADT/BitmaskEnum.h"
24bdd1243dSDimitry Andric #include "llvm/ADT/DenseMap.h"
2581ad6265SDimitry Andric #include "llvm/CodeGen/MachineScheduler.h"
2681ad6265SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h"
2781ad6265SDimitry Andric 
2881ad6265SDimitry Andric using namespace llvm;
2981ad6265SDimitry Andric 
30bdd1243dSDimitry Andric #define DEBUG_TYPE "igrouplp"
3181ad6265SDimitry Andric 
3281ad6265SDimitry Andric namespace {
3381ad6265SDimitry Andric 
34bdd1243dSDimitry Andric static cl::opt<bool> EnableExactSolver(
35bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver", cl::Hidden,
36bdd1243dSDimitry Andric     cl::desc("Whether to use the exponential time solver to fit "
37bdd1243dSDimitry Andric              "the instructions to the pipeline as closely as "
38bdd1243dSDimitry Andric              "possible."),
3981ad6265SDimitry Andric     cl::init(false));
4081ad6265SDimitry Andric 
41bdd1243dSDimitry Andric static cl::opt<unsigned> CutoffForExact(
42bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43bdd1243dSDimitry Andric     cl::desc("The maximum number of scheduling group conflicts "
44bdd1243dSDimitry Andric              "which we attempt to solve with the exponential time "
45bdd1243dSDimitry Andric              "exact solver. Problem sizes greater than this will"
46bdd1243dSDimitry Andric              "be solved by the less accurate greedy algorithm. Selecting "
47bdd1243dSDimitry Andric              "solver by size is superseded by manually selecting "
48bdd1243dSDimitry Andric              "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
4981ad6265SDimitry Andric 
50bdd1243dSDimitry Andric static cl::opt<uint64_t> MaxBranchesExplored(
51bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52bdd1243dSDimitry Andric     cl::desc("The amount of branches that we are willing to explore with"
53bdd1243dSDimitry Andric              "the exact algorithm before giving up."));
5481ad6265SDimitry Andric 
55bdd1243dSDimitry Andric static cl::opt<bool> UseCostHeur(
56bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57bdd1243dSDimitry Andric     cl::desc("Whether to use the cost heuristic to make choices as we "
58bdd1243dSDimitry Andric              "traverse the search space using the exact solver. Defaulted "
59bdd1243dSDimitry Andric              "to on, and if turned off, we will use the node order -- "
60bdd1243dSDimitry Andric              "attempting to put the later nodes in the later sched groups. "
61bdd1243dSDimitry Andric              "Experimentally, results are mixed, so this should be set on a "
62bdd1243dSDimitry Andric              "case-by-case basis."));
6381ad6265SDimitry Andric 
// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,
  VALU = 1u << 1,
  SALU = 1u << 2,
  MFMA = 1u << 3,
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  DS = 1u << 7,
  DS_READ = 1u << 8,
  DS_WRITE = 1u << 9,
  TRANS = 1u << 10,
  // Union of every category above; used as the "match anything" mask.
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
8381ad6265SDimitry Andric 
8406c3fb27SDimitry Andric class SchedGroup;
8506c3fb27SDimitry Andric 
8606c3fb27SDimitry Andric // InstructionRule class is used to enact a filter which determines whether or
8706c3fb27SDimitry Andric // not an SU maps to a given SchedGroup. It contains complementary data
8806c3fb27SDimitry Andric // structures (e.g Cache) to help those filters.
8906c3fb27SDimitry Andric class InstructionRule {
9006c3fb27SDimitry Andric protected:
9106c3fb27SDimitry Andric   const SIInstrInfo *TII;
9206c3fb27SDimitry Andric   unsigned SGID;
9306c3fb27SDimitry Andric   // A cache made available to the Filter to store SUnits for subsequent
9406c3fb27SDimitry Andric   // invocations of the Filter
9506c3fb27SDimitry Andric   std::optional<SmallVector<SUnit *, 4>> Cache;
9606c3fb27SDimitry Andric 
9706c3fb27SDimitry Andric public:
9806c3fb27SDimitry Andric   virtual bool
9906c3fb27SDimitry Andric   apply(const SUnit *, const ArrayRef<SUnit *>,
10006c3fb27SDimitry Andric         SmallVectorImpl<SchedGroup> &) {
10106c3fb27SDimitry Andric     return true;
10206c3fb27SDimitry Andric   };
10306c3fb27SDimitry Andric 
10406c3fb27SDimitry Andric   InstructionRule(const SIInstrInfo *TII, unsigned SGID,
10506c3fb27SDimitry Andric                   bool NeedsCache = false)
10606c3fb27SDimitry Andric       : TII(TII), SGID(SGID) {
10706c3fb27SDimitry Andric     if (NeedsCache) {
10806c3fb27SDimitry Andric       Cache = SmallVector<SUnit *, 4>();
10906c3fb27SDimitry Andric     }
11006c3fb27SDimitry Andric   }
11106c3fb27SDimitry Andric 
11206c3fb27SDimitry Andric   virtual ~InstructionRule() = default;
11306c3fb27SDimitry Andric };
11406c3fb27SDimitry Andric 
// Maps an SUnit to the list of SchedGroup IDs (SGIDs) it is a candidate for.
typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
11681ad6265SDimitry Andric 
// Classify instructions into groups to enable fine tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Mask that defines which instruction types can be classified into this
  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
  // and SCHED_GROUP_BARRIER.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the same
  // SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups
  unsigned SGID;

  // The different rules each instruction in this SchedGroup must conform to
  SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;

  // Count of the number of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  const SIInstrInfo *TII;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  // The DAG whose edges this group manipulates when linking SUnits.
  ScheduleDAGInstrs *DAG;

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies and track which edges are added, and the count of
  // missed edges
  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Append a constraint that SUs must meet in order to fit into this
  // SchedGroup. Since many rules involve the relationship between a SchedGroup
  // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
  // time (rather than SchedGroup init time.)
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
    Rules.push_back(NewRule);
  }

  // Returns true if the SU matches all rules
  bool allowedByRules(const SUnit *SU,
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
    if (Rules.empty())
      return true;
    for (size_t I = 0; I < Rules.size(); I++) {
      auto TheRule = Rules[I].get();
      // Rules are conjunctive: the first failing rule rejects the SU.
      if (!TheRule->apply(SU, Collection, SyncPipe)) {
        return false;
      }
    }
    return true;
  }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove last element in the SchedGroup
  void pop() { Collection.pop_back(); }

  // Identify and add all relevant SUs from the DAG to this SchedGroup.
  void initSchedGroup();

  // Add instructions to the SchedGroup bottom up starting from RIter.
  // PipelineInstrs is a set of instructions that should not be added to the
  // SchedGroup even when the other conditions for adding it are satisfied.
  // RIter will be added to the SchedGroup as well, and dependencies will be
  // added so that RIter will always be scheduled at the end of the group.
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  // Accessors for the group's synchronization ID, unique ID, and type mask.
  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
    // SGIDs are allocated sequentially from a class-wide counter.
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
    SGID = NumSchedGroups++;
  }
};
247bdd1243dSDimitry Andric 
// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
// The DAG parameter is unused in this body.
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);

  // removePred mutates the Preds list while we iterate it, so the outer
  // while restarts the scan until no predecessor edges remain.
  while (!SU.Preds.empty())
    for (auto &P : SU.Preds)
      SU.removePred(P);

  // Successor edges are represented as predecessor entries on the successor
  // nodes; remove each by deleting the matching pred entry that points back
  // at SU. The outer while again restarts until SU has no successors.
  while (!SU.Succs.empty())
    for (auto &S : SU.Succs)
      for (auto &SP : S.getSUnit()->Preds)
        if (SP.getSUnit() == &SU)
          S.getSUnit()->removePred(SP);
}
264bdd1243dSDimitry Andric 
// Pairs an SUnit with the SGIDs of the SchedGroups it could be assigned to.
typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
// A list of such conflicted instructions for one synchronized pipeline.
typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
267bdd1243dSDimitry Andric 
268bdd1243dSDimitry Andric // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
269bdd1243dSDimitry Andric // in non-trivial cases. For example, if the requested pipeline is
270bdd1243dSDimitry Andric // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
271bdd1243dSDimitry Andric // in the DAG, then we will have an instruction that can not be trivially
272bdd1243dSDimitry Andric // assigned to a SchedGroup. The PipelineSolver class implements two algorithms
273bdd1243dSDimitry Andric // to find a good solution to the pipeline -- a greedy algorithm and an exact
274bdd1243dSDimitry Andric // algorithm. The exact algorithm has an exponential time complexity and should
275bdd1243dSDimitry Andric // only be used for small sized problems or medium sized problems where an exact
276bdd1243dSDimitry Andric // solution is highly desired.
// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
// in non-trivial cases. For example, if the requested pipeline is
// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
// in the DAG, then we will have an instruction that can not be trivially
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
// to find a good solution to the pipeline -- a greedy algorithm and an exact
// algorithm. The exact algorithm has an exponential time complexity and should
// only be used for small sized problems or medium sized problems where an exact
// solution is highly desired.
class PipelineSolver {
  ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
  // The current working pipeline
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
  // The pipeline that has the best solution found so far
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
  unsigned computeProblemSize();

  // The cost penalty of not assigning a SU to a SchedGroup
  int MissPenalty = 0;

  // Costs in terms of the number of edges we are unable to add
  int BestCost = -1;
  int CurrCost = 0;

  // Index pointing to the conflicting instruction that is currently being
  // fitted
  int CurrConflInstNo = 0;
  // Index to the pipeline that is currently being fitted
  int CurrSyncGroupIdx = 0;
  // The first non trivial pipeline
  int BeginSyncGroupIdx = 0;

  // How many branches we have explored
  uint64_t BranchesExplored = 0;

  // The direction in which we process the candidate SchedGroups per SU
  bool IsBottomUp = 1;

  // Update indices to fit next conflicting instruction
  void advancePosition();
  // Recede indices to attempt to find better fit for previous conflicting
  // instruction
  void retreatPosition();

  // The exponential time algorithm which finds the provably best fit
  bool solveExact();
  // The polynomial time algorithm which attempts to find a good fit
  bool solveGreedy();
  // Find the best SchedGroup for the current SU using the heuristic given all
  // current information. One step in the greedy algorithm. Templated against
  // the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
                  T E);
  // Whether or not the current solution is optimal
  bool checkOptimal();
  // Populate the ready list, prioritizing fewest missed edges first
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
                         T E);
  // Add edges corresponding to the SchedGroups as assigned by solver
  void makePipeline();
  // Link the SchedGroups in the best found pipeline.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T> void linkSchedGroups(T I, T E);
  // Add the edges from the SU to the other SchedGroups in pipeline, and
  // return the number of edges missed.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
  /// returns the cost (in terms of missed pipeline edges), and tracks the edges
  /// added in \p AddedEdges
  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
  /// Remove the edges passed via \p AddedEdges
  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Convert the passed in maps to arrays for bidirectional iterators
  void convertSyncMapsToArrays();

  void reset();

public:
  // Invoke the solver to map instructions to instruction groups. Heuristic &&
  // command-line-option determines to use exact or greedy algorithm.
  void solve();

  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
                 ScheduleDAGMI *DAG, bool IsBottomUp = 1)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    // The solver is only needed if at least one sync pipeline has
    // instructions with multiple candidate SchedGroups.
    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    // Skip leading sync groups that have no conflicted instructions.
    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};
395bdd1243dSDimitry Andric 
396bdd1243dSDimitry Andric void PipelineSolver::reset() {
397bdd1243dSDimitry Andric 
398bdd1243dSDimitry Andric   for (auto &SyncPipeline : CurrPipeline) {
399bdd1243dSDimitry Andric     for (auto &SG : SyncPipeline) {
400bdd1243dSDimitry Andric       SmallVector<SUnit *, 32> TempCollection = SG.Collection;
401bdd1243dSDimitry Andric       SG.Collection.clear();
402bdd1243dSDimitry Andric       auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
403bdd1243dSDimitry Andric         return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
404bdd1243dSDimitry Andric       });
405bdd1243dSDimitry Andric       if (SchedBarr != TempCollection.end())
406bdd1243dSDimitry Andric         SG.Collection.push_back(*SchedBarr);
407bdd1243dSDimitry Andric     }
408bdd1243dSDimitry Andric   }
409bdd1243dSDimitry Andric 
410bdd1243dSDimitry Andric   CurrSyncGroupIdx = BeginSyncGroupIdx;
411bdd1243dSDimitry Andric   CurrConflInstNo = 0;
412bdd1243dSDimitry Andric   CurrCost = 0;
413bdd1243dSDimitry Andric }
414bdd1243dSDimitry Andric 
415bdd1243dSDimitry Andric void PipelineSolver::convertSyncMapsToArrays() {
416bdd1243dSDimitry Andric   for (auto &SyncPipe : SyncedSchedGroups) {
417bdd1243dSDimitry Andric     BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
418bdd1243dSDimitry Andric   }
419bdd1243dSDimitry Andric 
420bdd1243dSDimitry Andric   int PipelineIDx = SyncedInstrs.size() - 1;
421bdd1243dSDimitry Andric   PipelineInstrs.resize(SyncedInstrs.size());
422bdd1243dSDimitry Andric   for (auto &SyncInstrMap : SyncedInstrs) {
423bdd1243dSDimitry Andric     for (auto &SUsToCandSGs : SyncInstrMap.second) {
424bdd1243dSDimitry Andric       if (PipelineInstrs[PipelineIDx].size() == 0) {
425bdd1243dSDimitry Andric         PipelineInstrs[PipelineIDx].push_back(
426bdd1243dSDimitry Andric             std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
427bdd1243dSDimitry Andric         continue;
428bdd1243dSDimitry Andric       }
429bdd1243dSDimitry Andric       auto SortPosition = PipelineInstrs[PipelineIDx].begin();
430bdd1243dSDimitry Andric       // Insert them in sorted order -- this allows for good parsing order in
431bdd1243dSDimitry Andric       // the greedy algorithm
432bdd1243dSDimitry Andric       while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
433bdd1243dSDimitry Andric              SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
434bdd1243dSDimitry Andric         ++SortPosition;
435bdd1243dSDimitry Andric       PipelineInstrs[PipelineIDx].insert(
436bdd1243dSDimitry Andric           SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
437bdd1243dSDimitry Andric     }
438bdd1243dSDimitry Andric     --PipelineIDx;
439bdd1243dSDimitry Andric   }
440bdd1243dSDimitry Andric }
441bdd1243dSDimitry Andric 
44206c3fb27SDimitry Andric template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
44306c3fb27SDimitry Andric   for (; I != E; ++I) {
44406c3fb27SDimitry Andric     auto &GroupA = *I;
44506c3fb27SDimitry Andric     for (auto J = std::next(I); J != E; ++J) {
44606c3fb27SDimitry Andric       auto &GroupB = *J;
44706c3fb27SDimitry Andric       GroupA.link(GroupB);
44806c3fb27SDimitry Andric     }
44906c3fb27SDimitry Andric   }
45006c3fb27SDimitry Andric }
45106c3fb27SDimitry Andric 
452bdd1243dSDimitry Andric void PipelineSolver::makePipeline() {
453bdd1243dSDimitry Andric   // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
454bdd1243dSDimitry Andric   for (auto &SyncPipeline : BestPipeline) {
45506c3fb27SDimitry Andric     LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
456bdd1243dSDimitry Andric     for (auto &SG : SyncPipeline) {
45706c3fb27SDimitry Andric       LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
45806c3fb27SDimitry Andric                         << " has: \n");
459bdd1243dSDimitry Andric       SUnit *SGBarr = nullptr;
460bdd1243dSDimitry Andric       for (auto &SU : SG.Collection) {
461bdd1243dSDimitry Andric         if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
462bdd1243dSDimitry Andric           SGBarr = SU;
46306c3fb27SDimitry Andric         LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
464bdd1243dSDimitry Andric       }
465bdd1243dSDimitry Andric       // Command line requested IGroupLP doesn't have SGBarr
466bdd1243dSDimitry Andric       if (!SGBarr)
467bdd1243dSDimitry Andric         continue;
468bdd1243dSDimitry Andric       resetEdges(*SGBarr, DAG);
469bdd1243dSDimitry Andric       SG.link(*SGBarr, false);
470bdd1243dSDimitry Andric     }
471bdd1243dSDimitry Andric   }
472bdd1243dSDimitry Andric 
473bdd1243dSDimitry Andric   for (auto &SyncPipeline : BestPipeline) {
47406c3fb27SDimitry Andric     IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
47506c3fb27SDimitry Andric                : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
47681ad6265SDimitry Andric   }
47781ad6265SDimitry Andric }
47806c3fb27SDimitry Andric 
// Link SU against every SchedGroup in [I, E) as if it belonged to the group
// with ID SGID, returning the total cost (missed edges) and recording the
// edges actually added in AddedEdges.
template <typename T>
int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
    T I, T E) {
  // Groups visited before SU's own group get SU as a successor; once the
  // candidate group (SGID) is reached, SU becomes a predecessor of the SUs
  // in every remaining group.
  bool MakePred = false;
  int AddedCost = 0;
  for (; I < E; ++I) {
    if (I->getSGID() == SGID) {
      MakePred = true;
      continue;
    }
    // NOTE(review): this copies the SchedGroup (including its Collection)
    // each iteration; `auto &Group = *I;` looks sufficient since link() only
    // mutates the DAG -- verify before changing.
    auto Group = *I;
    AddedCost += Group.link(*SU, MakePred, AddedEdges);
    assert(AddedCost >= 0);
  }
  return AddedCost;
}
49681ad6265SDimitry Andric 
497bdd1243dSDimitry Andric int PipelineSolver::addEdges(
498bdd1243dSDimitry Andric     SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
499bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
500bdd1243dSDimitry Andric 
50106c3fb27SDimitry Andric   // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
50206c3fb27SDimitry Andric   // instructions that are the ultimate successors in the resultant mutation.
50306c3fb27SDimitry Andric   // Therefore, in such a configuration, the SchedGroups occurring before the
50406c3fb27SDimitry Andric   // candidate SGID are successors of the candidate SchedGroup, thus the current
50506c3fb27SDimitry Andric   // SU should be linked as a predecessor to SUs in those SchedGroups. The
50606c3fb27SDimitry Andric   // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
50706c3fb27SDimitry Andric   // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
50806c3fb27SDimitry Andric   // IsBottomUp (in reverse).
50906c3fb27SDimitry Andric   return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
51006c3fb27SDimitry Andric                                 SyncPipeline.rend())
51106c3fb27SDimitry Andric                     : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
51206c3fb27SDimitry Andric                                 SyncPipeline.end());
513bdd1243dSDimitry Andric }
514bdd1243dSDimitry Andric 
515bdd1243dSDimitry Andric void PipelineSolver::removeEdges(
516bdd1243dSDimitry Andric     const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
517bdd1243dSDimitry Andric   // Only remove the edges that we have added when testing
518bdd1243dSDimitry Andric   // the fit.
519bdd1243dSDimitry Andric   for (auto &PredSuccPair : EdgesToRemove) {
520bdd1243dSDimitry Andric     SUnit *Pred = PredSuccPair.first;
521bdd1243dSDimitry Andric     SUnit *Succ = PredSuccPair.second;
522bdd1243dSDimitry Andric 
523bdd1243dSDimitry Andric     auto Match = llvm::find_if(
524bdd1243dSDimitry Andric         Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
525bdd1243dSDimitry Andric     if (Match != Succ->Preds.end()) {
526bdd1243dSDimitry Andric       assert(Match->isArtificial());
527bdd1243dSDimitry Andric       Succ->removePred(*Match);
528bdd1243dSDimitry Andric     }
529bdd1243dSDimitry Andric   }
530bdd1243dSDimitry Andric }
531bdd1243dSDimitry Andric 
532bdd1243dSDimitry Andric void PipelineSolver::advancePosition() {
533bdd1243dSDimitry Andric   ++CurrConflInstNo;
534bdd1243dSDimitry Andric 
535bdd1243dSDimitry Andric   if (static_cast<size_t>(CurrConflInstNo) >=
536bdd1243dSDimitry Andric       PipelineInstrs[CurrSyncGroupIdx].size()) {
537bdd1243dSDimitry Andric     CurrConflInstNo = 0;
538bdd1243dSDimitry Andric     ++CurrSyncGroupIdx;
539bdd1243dSDimitry Andric     // Advance to next non-trivial pipeline
540bdd1243dSDimitry Andric     while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
541bdd1243dSDimitry Andric            PipelineInstrs[CurrSyncGroupIdx].size() == 0)
542bdd1243dSDimitry Andric       ++CurrSyncGroupIdx;
543bdd1243dSDimitry Andric   }
544bdd1243dSDimitry Andric }
545bdd1243dSDimitry Andric 
546bdd1243dSDimitry Andric void PipelineSolver::retreatPosition() {
547bdd1243dSDimitry Andric   assert(CurrConflInstNo >= 0);
548bdd1243dSDimitry Andric   assert(CurrSyncGroupIdx >= 0);
549bdd1243dSDimitry Andric 
550bdd1243dSDimitry Andric   if (CurrConflInstNo > 0) {
551bdd1243dSDimitry Andric     --CurrConflInstNo;
552bdd1243dSDimitry Andric     return;
553bdd1243dSDimitry Andric   }
554bdd1243dSDimitry Andric 
555bdd1243dSDimitry Andric   if (CurrConflInstNo == 0) {
556bdd1243dSDimitry Andric     // If we return to the starting position, we have explored
557bdd1243dSDimitry Andric     // the entire tree
558bdd1243dSDimitry Andric     if (CurrSyncGroupIdx == BeginSyncGroupIdx)
559bdd1243dSDimitry Andric       return;
560bdd1243dSDimitry Andric 
561bdd1243dSDimitry Andric     --CurrSyncGroupIdx;
562bdd1243dSDimitry Andric     // Go to previous non-trivial pipeline
563bdd1243dSDimitry Andric     while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
564bdd1243dSDimitry Andric       --CurrSyncGroupIdx;
565bdd1243dSDimitry Andric 
566bdd1243dSDimitry Andric     CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
567bdd1243dSDimitry Andric   }
568bdd1243dSDimitry Andric }
569bdd1243dSDimitry Andric 
570bdd1243dSDimitry Andric bool PipelineSolver::checkOptimal() {
571bdd1243dSDimitry Andric   if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
572bdd1243dSDimitry Andric     if (BestCost == -1 || CurrCost < BestCost) {
573bdd1243dSDimitry Andric       BestPipeline = CurrPipeline;
574bdd1243dSDimitry Andric       BestCost = CurrCost;
575bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
576bdd1243dSDimitry Andric     }
577bdd1243dSDimitry Andric     assert(BestCost >= 0);
578bdd1243dSDimitry Andric   }
579bdd1243dSDimitry Andric 
580bdd1243dSDimitry Andric   bool DoneExploring = false;
581bdd1243dSDimitry Andric   if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
582bdd1243dSDimitry Andric     DoneExploring = true;
583bdd1243dSDimitry Andric 
584bdd1243dSDimitry Andric   return (DoneExploring || BestCost == 0);
585bdd1243dSDimitry Andric }
586bdd1243dSDimitry Andric 
58706c3fb27SDimitry Andric template <typename T>
588bdd1243dSDimitry Andric void PipelineSolver::populateReadyList(
58906c3fb27SDimitry Andric     SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
59006c3fb27SDimitry Andric   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
59106c3fb27SDimitry Andric   auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
592bdd1243dSDimitry Andric   assert(CurrSU.second.size() >= 1);
59306c3fb27SDimitry Andric 
594bdd1243dSDimitry Andric   for (; I != E; ++I) {
595bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
596bdd1243dSDimitry Andric     int CandSGID = *I;
5975f757f3fSDimitry Andric     SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
5985f757f3fSDimitry Andric       return SG.getSGID() == CandSGID;
5995f757f3fSDimitry Andric     });
6005f757f3fSDimitry Andric     assert(Match);
601bdd1243dSDimitry Andric 
602bdd1243dSDimitry Andric     if (UseCostHeur) {
603bdd1243dSDimitry Andric       if (Match->isFull()) {
604bdd1243dSDimitry Andric         ReadyList.push_back(std::pair(*I, MissPenalty));
605bdd1243dSDimitry Andric         continue;
606bdd1243dSDimitry Andric       }
607bdd1243dSDimitry Andric 
608bdd1243dSDimitry Andric       int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
609bdd1243dSDimitry Andric       ReadyList.push_back(std::pair(*I, TempCost));
610bdd1243dSDimitry Andric       removeEdges(AddedEdges);
611bdd1243dSDimitry Andric     } else
612bdd1243dSDimitry Andric       ReadyList.push_back(std::pair(*I, -1));
613bdd1243dSDimitry Andric   }
614bdd1243dSDimitry Andric 
615bdd1243dSDimitry Andric   if (UseCostHeur) {
616bdd1243dSDimitry Andric     std::sort(ReadyList.begin(), ReadyList.end(),
617bdd1243dSDimitry Andric               [](std::pair<int, int> A, std::pair<int, int> B) {
618bdd1243dSDimitry Andric                 return A.second < B.second;
619bdd1243dSDimitry Andric               });
620bdd1243dSDimitry Andric   }
621bdd1243dSDimitry Andric 
622bdd1243dSDimitry Andric   assert(ReadyList.size() == CurrSU.second.size());
623bdd1243dSDimitry Andric }
624bdd1243dSDimitry Andric 
625bdd1243dSDimitry Andric bool PipelineSolver::solveExact() {
626bdd1243dSDimitry Andric   if (checkOptimal())
627bdd1243dSDimitry Andric     return true;
628bdd1243dSDimitry Andric 
629bdd1243dSDimitry Andric   if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
630bdd1243dSDimitry Andric     return false;
631bdd1243dSDimitry Andric 
632bdd1243dSDimitry Andric   assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
633bdd1243dSDimitry Andric   assert(static_cast<size_t>(CurrConflInstNo) <
634bdd1243dSDimitry Andric          PipelineInstrs[CurrSyncGroupIdx].size());
635bdd1243dSDimitry Andric   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
636bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
637bdd1243dSDimitry Andric                     << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
638bdd1243dSDimitry Andric 
639bdd1243dSDimitry Andric   // SchedGroup -> Cost pairs
640bdd1243dSDimitry Andric   SmallVector<std::pair<int, int>, 4> ReadyList;
641bdd1243dSDimitry Andric   // Prioritize the candidate sched groups in terms of lowest cost first
64206c3fb27SDimitry Andric   IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
64306c3fb27SDimitry Andric                                  CurrSU.second.rend())
64406c3fb27SDimitry Andric              : populateReadyList(ReadyList, CurrSU.second.begin(),
64506c3fb27SDimitry Andric                                  CurrSU.second.end());
646bdd1243dSDimitry Andric 
647bdd1243dSDimitry Andric   auto I = ReadyList.begin();
648bdd1243dSDimitry Andric   auto E = ReadyList.end();
649bdd1243dSDimitry Andric   for (; I != E; ++I) {
650bdd1243dSDimitry Andric     // If we are trying SGs in least cost order, and the current SG is cost
651bdd1243dSDimitry Andric     // infeasible, then all subsequent SGs will also be cost infeasible, so we
652bdd1243dSDimitry Andric     // can prune.
653bdd1243dSDimitry Andric     if (BestCost != -1 && (CurrCost + I->second > BestCost))
654bdd1243dSDimitry Andric       return false;
655bdd1243dSDimitry Andric 
656bdd1243dSDimitry Andric     int CandSGID = I->first;
657bdd1243dSDimitry Andric     int AddedCost = 0;
658bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
659bdd1243dSDimitry Andric     auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
660bdd1243dSDimitry Andric     SchedGroup *Match;
661bdd1243dSDimitry Andric     for (auto &SG : SyncPipeline) {
662bdd1243dSDimitry Andric       if (SG.getSGID() == CandSGID)
663bdd1243dSDimitry Andric         Match = &SG;
664bdd1243dSDimitry Andric     }
665bdd1243dSDimitry Andric 
666bdd1243dSDimitry Andric     if (Match->isFull())
667bdd1243dSDimitry Andric       continue;
668bdd1243dSDimitry Andric 
66906c3fb27SDimitry Andric     if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
67006c3fb27SDimitry Andric       continue;
67106c3fb27SDimitry Andric 
672bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
673bdd1243dSDimitry Andric                       << (int)Match->getMask() << "and ID " << CandSGID
674bdd1243dSDimitry Andric                       << "\n");
675bdd1243dSDimitry Andric     Match->add(*CurrSU.first);
676bdd1243dSDimitry Andric     AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
677bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
678bdd1243dSDimitry Andric     CurrCost += AddedCost;
679bdd1243dSDimitry Andric     advancePosition();
680bdd1243dSDimitry Andric     ++BranchesExplored;
681bdd1243dSDimitry Andric     bool FinishedExploring = false;
682bdd1243dSDimitry Andric     // If the Cost after adding edges is greater than a known solution,
683bdd1243dSDimitry Andric     // backtrack
684bdd1243dSDimitry Andric     if (CurrCost < BestCost || BestCost == -1) {
685bdd1243dSDimitry Andric       if (solveExact()) {
686bdd1243dSDimitry Andric         FinishedExploring = BestCost != 0;
687bdd1243dSDimitry Andric         if (!FinishedExploring)
688bdd1243dSDimitry Andric           return true;
689bdd1243dSDimitry Andric       }
690bdd1243dSDimitry Andric     }
691bdd1243dSDimitry Andric 
692bdd1243dSDimitry Andric     retreatPosition();
693bdd1243dSDimitry Andric     CurrCost -= AddedCost;
694bdd1243dSDimitry Andric     removeEdges(AddedEdges);
695bdd1243dSDimitry Andric     Match->pop();
696bdd1243dSDimitry Andric     CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
697bdd1243dSDimitry Andric     if (FinishedExploring)
698bdd1243dSDimitry Andric       return true;
699bdd1243dSDimitry Andric   }
700bdd1243dSDimitry Andric 
701bdd1243dSDimitry Andric   // Try the pipeline where the current instruction is omitted
702bdd1243dSDimitry Andric   // Potentially if we omit a problematic instruction from the pipeline,
703bdd1243dSDimitry Andric   // all the other instructions can nicely fit.
704bdd1243dSDimitry Andric   CurrCost += MissPenalty;
705bdd1243dSDimitry Andric   advancePosition();
706bdd1243dSDimitry Andric 
707bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
708bdd1243dSDimitry Andric 
709bdd1243dSDimitry Andric   bool FinishedExploring = false;
710bdd1243dSDimitry Andric   if (CurrCost < BestCost || BestCost == -1) {
711bdd1243dSDimitry Andric     if (solveExact()) {
712bdd1243dSDimitry Andric       bool FinishedExploring = BestCost != 0;
713bdd1243dSDimitry Andric       if (!FinishedExploring)
714bdd1243dSDimitry Andric         return true;
715bdd1243dSDimitry Andric     }
716bdd1243dSDimitry Andric   }
717bdd1243dSDimitry Andric 
718bdd1243dSDimitry Andric   retreatPosition();
719bdd1243dSDimitry Andric   CurrCost -= MissPenalty;
720bdd1243dSDimitry Andric   return FinishedExploring;
721bdd1243dSDimitry Andric }
722bdd1243dSDimitry Andric 
72306c3fb27SDimitry Andric template <typename T>
72406c3fb27SDimitry Andric void PipelineSolver::greedyFind(
72506c3fb27SDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
726bdd1243dSDimitry Andric   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
727bdd1243dSDimitry Andric   int BestNodeCost = -1;
728bdd1243dSDimitry Andric   int TempCost;
729bdd1243dSDimitry Andric   SchedGroup *BestGroup = nullptr;
730bdd1243dSDimitry Andric   int BestGroupID = -1;
731bdd1243dSDimitry Andric   auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
732bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
733bdd1243dSDimitry Andric                     << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
734bdd1243dSDimitry Andric 
735bdd1243dSDimitry Andric   // Since we have added the potential SchedGroups from bottom up, but
736bdd1243dSDimitry Andric   // traversed the DAG from top down, parse over the groups from last to
737bdd1243dSDimitry Andric   // first. If we fail to do this for the greedy algorithm, the solution will
738bdd1243dSDimitry Andric   // likely not be good in more complex cases.
739bdd1243dSDimitry Andric   for (; I != E; ++I) {
740bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
741bdd1243dSDimitry Andric     int CandSGID = *I;
7425f757f3fSDimitry Andric     SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
7435f757f3fSDimitry Andric       return SG.getSGID() == CandSGID;
7445f757f3fSDimitry Andric     });
7455f757f3fSDimitry Andric     assert(Match);
746bdd1243dSDimitry Andric 
747bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
748bdd1243dSDimitry Andric                       << (int)Match->getMask() << "\n");
749bdd1243dSDimitry Andric 
750bdd1243dSDimitry Andric     if (Match->isFull()) {
751bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
752bdd1243dSDimitry Andric       continue;
753bdd1243dSDimitry Andric     }
75406c3fb27SDimitry Andric     if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
75506c3fb27SDimitry Andric       LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
75606c3fb27SDimitry Andric       continue;
75706c3fb27SDimitry Andric     }
758bdd1243dSDimitry Andric     TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
759bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
760bdd1243dSDimitry Andric     if (TempCost < BestNodeCost || BestNodeCost == -1) {
761bdd1243dSDimitry Andric       BestGroup = Match;
762bdd1243dSDimitry Andric       BestNodeCost = TempCost;
763bdd1243dSDimitry Andric       BestGroupID = CandSGID;
764bdd1243dSDimitry Andric     }
765bdd1243dSDimitry Andric     removeEdges(AddedEdges);
766bdd1243dSDimitry Andric     if (BestNodeCost == 0)
767bdd1243dSDimitry Andric       break;
768bdd1243dSDimitry Andric   }
769bdd1243dSDimitry Andric 
770bdd1243dSDimitry Andric   if (BestGroupID != -1) {
771bdd1243dSDimitry Andric     BestGroup->add(*CurrSU.first);
772bdd1243dSDimitry Andric     addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
773bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
774bdd1243dSDimitry Andric                       << (int)BestGroup->getMask() << "\n");
775bdd1243dSDimitry Andric     BestCost += TempCost;
776bdd1243dSDimitry Andric   } else
777bdd1243dSDimitry Andric     BestCost += MissPenalty;
778bdd1243dSDimitry Andric 
779bdd1243dSDimitry Andric   CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
78006c3fb27SDimitry Andric }
78106c3fb27SDimitry Andric 
78206c3fb27SDimitry Andric bool PipelineSolver::solveGreedy() {
78306c3fb27SDimitry Andric   BestCost = 0;
78406c3fb27SDimitry Andric   std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
78506c3fb27SDimitry Andric 
78606c3fb27SDimitry Andric   while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
78706c3fb27SDimitry Andric     SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
78806c3fb27SDimitry Andric     IsBottomUp
78906c3fb27SDimitry Andric         ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
79006c3fb27SDimitry Andric         : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
791bdd1243dSDimitry Andric     advancePosition();
792bdd1243dSDimitry Andric   }
793bdd1243dSDimitry Andric   BestPipeline = CurrPipeline;
794bdd1243dSDimitry Andric   removeEdges(AddedEdges);
795bdd1243dSDimitry Andric   return false;
796bdd1243dSDimitry Andric }
797bdd1243dSDimitry Andric 
798bdd1243dSDimitry Andric unsigned PipelineSolver::computeProblemSize() {
799bdd1243dSDimitry Andric   unsigned ProblemSize = 0;
800bdd1243dSDimitry Andric   for (auto &PipeConflicts : PipelineInstrs) {
801bdd1243dSDimitry Andric     ProblemSize += PipeConflicts.size();
802bdd1243dSDimitry Andric   }
803bdd1243dSDimitry Andric 
804bdd1243dSDimitry Andric   return ProblemSize;
805bdd1243dSDimitry Andric }
806bdd1243dSDimitry Andric 
807bdd1243dSDimitry Andric void PipelineSolver::solve() {
808bdd1243dSDimitry Andric   if (!NeedsSolver)
809bdd1243dSDimitry Andric     return;
810bdd1243dSDimitry Andric 
811bdd1243dSDimitry Andric   unsigned ProblemSize = computeProblemSize();
812bdd1243dSDimitry Andric   assert(ProblemSize > 0);
813bdd1243dSDimitry Andric 
814bdd1243dSDimitry Andric   bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
815bdd1243dSDimitry Andric   MissPenalty = (ProblemSize / 2) + 1;
816bdd1243dSDimitry Andric 
817bdd1243dSDimitry Andric   LLVM_DEBUG(DAG->dump());
818bdd1243dSDimitry Andric   if (EnableExactSolver || BelowCutoff) {
819bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
820bdd1243dSDimitry Andric     solveGreedy();
821bdd1243dSDimitry Andric     reset();
822bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
823bdd1243dSDimitry Andric     if (BestCost > 0) {
824bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
825bdd1243dSDimitry Andric       solveExact();
826bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
827bdd1243dSDimitry Andric     }
828bdd1243dSDimitry Andric   } else { // Use the Greedy Algorithm by default
829bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
830bdd1243dSDimitry Andric     solveGreedy();
831bdd1243dSDimitry Andric   }
832bdd1243dSDimitry Andric 
833bdd1243dSDimitry Andric   makePipeline();
83406c3fb27SDimitry Andric   LLVM_DEBUG(dbgs() << "After applying mutation\n");
83506c3fb27SDimitry Andric   LLVM_DEBUG(DAG->dump());
836bdd1243dSDimitry Andric }
837bdd1243dSDimitry Andric 
83806c3fb27SDimitry Andric enum IGLPStrategyID : int {
83906c3fb27SDimitry Andric   MFMASmallGemmOptID = 0,
84006c3fb27SDimitry Andric   MFMASmallGemmSingleWaveOptID = 1,
84106c3fb27SDimitry Andric };
842bdd1243dSDimitry Andric 
843bdd1243dSDimitry Andric // Implement a IGLP scheduling strategy.
844bdd1243dSDimitry Andric class IGLPStrategy {
845bdd1243dSDimitry Andric protected:
846bdd1243dSDimitry Andric   ScheduleDAGInstrs *DAG;
847bdd1243dSDimitry Andric 
848bdd1243dSDimitry Andric   const SIInstrInfo *TII;
849bdd1243dSDimitry Andric 
850bdd1243dSDimitry Andric public:
8515f757f3fSDimitry Andric   /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
852bdd1243dSDimitry Andric   virtual void applyIGLPStrategy(
853bdd1243dSDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
8545f757f3fSDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
8555f757f3fSDimitry Andric       bool IsReentry) = 0;
856bdd1243dSDimitry Andric 
857bdd1243dSDimitry Andric   // Returns true if this strategy should be applied to a ScheduleDAG.
858bdd1243dSDimitry Andric   virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
859bdd1243dSDimitry Andric 
86006c3fb27SDimitry Andric   bool IsBottomUp = 1;
86106c3fb27SDimitry Andric 
862bdd1243dSDimitry Andric   IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
863bdd1243dSDimitry Andric       : DAG(DAG), TII(TII) {}
864bdd1243dSDimitry Andric 
865bdd1243dSDimitry Andric   virtual ~IGLPStrategy() = default;
866bdd1243dSDimitry Andric };
867bdd1243dSDimitry Andric 
868bdd1243dSDimitry Andric class MFMASmallGemmOpt final : public IGLPStrategy {
86906c3fb27SDimitry Andric private:
870bdd1243dSDimitry Andric public:
871bdd1243dSDimitry Andric   void applyIGLPStrategy(
872bdd1243dSDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
8735f757f3fSDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
8745f757f3fSDimitry Andric       bool IsReentry) override;
875bdd1243dSDimitry Andric 
876bdd1243dSDimitry Andric   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
877bdd1243dSDimitry Andric 
878bdd1243dSDimitry Andric   MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
87906c3fb27SDimitry Andric       : IGLPStrategy(DAG, TII) {
88006c3fb27SDimitry Andric     IsBottomUp = 1;
88106c3fb27SDimitry Andric   }
882bdd1243dSDimitry Andric };
883bdd1243dSDimitry Andric 
884bdd1243dSDimitry Andric void MFMASmallGemmOpt::applyIGLPStrategy(
885bdd1243dSDimitry Andric     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
8865f757f3fSDimitry Andric     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
8875f757f3fSDimitry Andric     bool IsReentry) {
888bdd1243dSDimitry Andric   // Count the number of MFMA instructions.
889bdd1243dSDimitry Andric   unsigned MFMACount = 0;
890bdd1243dSDimitry Andric   for (const MachineInstr &I : *DAG)
891bdd1243dSDimitry Andric     if (TII->isMFMAorWMMA(I))
892bdd1243dSDimitry Andric       ++MFMACount;
893bdd1243dSDimitry Andric 
894bdd1243dSDimitry Andric   const unsigned PipelineSyncID = 0;
895bdd1243dSDimitry Andric   SchedGroup *SG = nullptr;
896bdd1243dSDimitry Andric   for (unsigned I = 0; I < MFMACount * 3; ++I) {
897bdd1243dSDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
898bdd1243dSDimitry Andric         SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
899bdd1243dSDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
900bdd1243dSDimitry Andric 
901bdd1243dSDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
902bdd1243dSDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
903bdd1243dSDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
904bdd1243dSDimitry Andric   }
905bdd1243dSDimitry Andric }
906bdd1243dSDimitry Andric 
90706c3fb27SDimitry Andric class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
90806c3fb27SDimitry Andric private:
90906c3fb27SDimitry Andric   // Whether the DS_READ is a predecessor of first four MFMA in region
91006c3fb27SDimitry Andric   class EnablesInitialMFMA final : public InstructionRule {
91106c3fb27SDimitry Andric   public:
91206c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
91306c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
91406c3fb27SDimitry Andric       if (!SyncPipe.size())
91506c3fb27SDimitry Andric         return false;
91606c3fb27SDimitry Andric       int MFMAsFound = 0;
91706c3fb27SDimitry Andric       if (!Cache->size()) {
91806c3fb27SDimitry Andric         for (auto &Elt : SyncPipe[0].DAG->SUnits) {
91906c3fb27SDimitry Andric           if (TII->isMFMAorWMMA(*Elt.getInstr())) {
92006c3fb27SDimitry Andric             ++MFMAsFound;
92106c3fb27SDimitry Andric             if (MFMAsFound > 4)
92206c3fb27SDimitry Andric               break;
92306c3fb27SDimitry Andric             Cache->push_back(&Elt);
92406c3fb27SDimitry Andric           }
92506c3fb27SDimitry Andric         }
92606c3fb27SDimitry Andric       }
92706c3fb27SDimitry Andric 
92806c3fb27SDimitry Andric       assert(Cache->size());
92906c3fb27SDimitry Andric       auto DAG = SyncPipe[0].DAG;
93006c3fb27SDimitry Andric       for (auto &Elt : *Cache) {
93106c3fb27SDimitry Andric         if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
93206c3fb27SDimitry Andric           return true;
93306c3fb27SDimitry Andric       }
93406c3fb27SDimitry Andric       return false;
93506c3fb27SDimitry Andric     }
93606c3fb27SDimitry Andric 
93706c3fb27SDimitry Andric     EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
93806c3fb27SDimitry Andric                        bool NeedsCache = false)
93906c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
94006c3fb27SDimitry Andric   };
94106c3fb27SDimitry Andric 
94206c3fb27SDimitry Andric   // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
94306c3fb27SDimitry Andric   class IsPermForDSW final : public InstructionRule {
94406c3fb27SDimitry Andric   public:
94506c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
94606c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
94706c3fb27SDimitry Andric       auto MI = SU->getInstr();
94806c3fb27SDimitry Andric       if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
94906c3fb27SDimitry Andric         return false;
95006c3fb27SDimitry Andric 
95106c3fb27SDimitry Andric       bool FitsInGroup = false;
95206c3fb27SDimitry Andric       // Does the VALU have a DS_WRITE successor
95306c3fb27SDimitry Andric       if (!Collection.size()) {
95406c3fb27SDimitry Andric         for (auto &Succ : SU->Succs) {
95506c3fb27SDimitry Andric           SUnit *SuccUnit = Succ.getSUnit();
95606c3fb27SDimitry Andric           if (TII->isDS(*SuccUnit->getInstr()) &&
95706c3fb27SDimitry Andric               SuccUnit->getInstr()->mayStore()) {
95806c3fb27SDimitry Andric             Cache->push_back(SuccUnit);
95906c3fb27SDimitry Andric             FitsInGroup = true;
96006c3fb27SDimitry Andric           }
96106c3fb27SDimitry Andric         }
96206c3fb27SDimitry Andric         return FitsInGroup;
96306c3fb27SDimitry Andric       }
96406c3fb27SDimitry Andric 
96506c3fb27SDimitry Andric       assert(Cache->size());
96606c3fb27SDimitry Andric 
96706c3fb27SDimitry Andric       // Does the VALU have a DS_WRITE successor that is the same as other
96806c3fb27SDimitry Andric       // VALU already in the group. The V_PERMs will all share 1 DS_W succ
9695f757f3fSDimitry Andric       return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
9705f757f3fSDimitry Andric         return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
97106c3fb27SDimitry Andric           return ThisSucc.getSUnit() == Elt;
97206c3fb27SDimitry Andric         });
97306c3fb27SDimitry Andric       });
97406c3fb27SDimitry Andric     }
97506c3fb27SDimitry Andric 
97606c3fb27SDimitry Andric     IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
97706c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
97806c3fb27SDimitry Andric   };
97906c3fb27SDimitry Andric 
98006c3fb27SDimitry Andric   // Whether the SU is a successor of any element in previous SchedGroup
98106c3fb27SDimitry Andric   class IsSuccOfPrevGroup final : public InstructionRule {
98206c3fb27SDimitry Andric   public:
98306c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
98406c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
98506c3fb27SDimitry Andric       SchedGroup *OtherGroup = nullptr;
98606c3fb27SDimitry Andric       for (auto &PipeSG : SyncPipe) {
98706c3fb27SDimitry Andric         if ((unsigned)PipeSG.getSGID() == SGID - 1) {
98806c3fb27SDimitry Andric           OtherGroup = &PipeSG;
98906c3fb27SDimitry Andric         }
99006c3fb27SDimitry Andric       }
99106c3fb27SDimitry Andric 
99206c3fb27SDimitry Andric       if (!OtherGroup)
99306c3fb27SDimitry Andric         return false;
99406c3fb27SDimitry Andric       if (!OtherGroup->Collection.size())
99506c3fb27SDimitry Andric         return true;
99606c3fb27SDimitry Andric 
99706c3fb27SDimitry Andric       // Does the previous VALU have this DS_Write as a successor
99806c3fb27SDimitry Andric       return (std::any_of(OtherGroup->Collection.begin(),
99906c3fb27SDimitry Andric                           OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
100006c3fb27SDimitry Andric                             return std::any_of(Elt->Succs.begin(),
100106c3fb27SDimitry Andric                                                Elt->Succs.end(),
100206c3fb27SDimitry Andric                                                [&SU](SDep &Succ) {
100306c3fb27SDimitry Andric                                                  return Succ.getSUnit() == SU;
100406c3fb27SDimitry Andric                                                });
100506c3fb27SDimitry Andric                           }));
100606c3fb27SDimitry Andric     }
100706c3fb27SDimitry Andric     IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
100806c3fb27SDimitry Andric                       bool NeedsCache = false)
100906c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
101006c3fb27SDimitry Andric   };
101106c3fb27SDimitry Andric 
101206c3fb27SDimitry Andric   // Whether the combined load width of group is 128 bits
101306c3fb27SDimitry Andric   class VMEMSize final : public InstructionRule {
101406c3fb27SDimitry Andric   public:
101506c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
101606c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
101706c3fb27SDimitry Andric       auto MI = SU->getInstr();
101806c3fb27SDimitry Andric       if (MI->getOpcode() == TargetOpcode::BUNDLE)
101906c3fb27SDimitry Andric         return false;
102006c3fb27SDimitry Andric       if (!Collection.size())
102106c3fb27SDimitry Andric         return true;
102206c3fb27SDimitry Andric 
102306c3fb27SDimitry Andric       int NumBits = 0;
102406c3fb27SDimitry Andric 
102506c3fb27SDimitry Andric       auto TRI = TII->getRegisterInfo();
102606c3fb27SDimitry Andric       auto &MRI = MI->getParent()->getParent()->getRegInfo();
102706c3fb27SDimitry Andric       for (auto &Elt : Collection) {
102806c3fb27SDimitry Andric         auto Op = Elt->getInstr()->getOperand(0);
102906c3fb27SDimitry Andric         auto Size =
103006c3fb27SDimitry Andric             TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
103106c3fb27SDimitry Andric         NumBits += Size;
103206c3fb27SDimitry Andric       }
103306c3fb27SDimitry Andric 
103406c3fb27SDimitry Andric       if (NumBits < 128) {
103506c3fb27SDimitry Andric         assert(TII->isVMEM(*MI) && MI->mayLoad());
103606c3fb27SDimitry Andric         if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
103706c3fb27SDimitry Andric                           MRI, MI->getOperand(0))) <=
103806c3fb27SDimitry Andric             128)
103906c3fb27SDimitry Andric           return true;
104006c3fb27SDimitry Andric       }
104106c3fb27SDimitry Andric 
104206c3fb27SDimitry Andric       return false;
104306c3fb27SDimitry Andric     }
104406c3fb27SDimitry Andric 
104506c3fb27SDimitry Andric     VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
104606c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
104706c3fb27SDimitry Andric   };
104806c3fb27SDimitry Andric 
10495f757f3fSDimitry Andric   /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
10505f757f3fSDimitry Andric   /// that is \p Distance steps away
105106c3fb27SDimitry Andric   class SharesPredWithPrevNthGroup final : public InstructionRule {
105206c3fb27SDimitry Andric   private:
105306c3fb27SDimitry Andric     unsigned Distance = 1;
105406c3fb27SDimitry Andric 
105506c3fb27SDimitry Andric   public:
105606c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
105706c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
105806c3fb27SDimitry Andric       SchedGroup *OtherGroup = nullptr;
105906c3fb27SDimitry Andric       if (!SyncPipe.size())
106006c3fb27SDimitry Andric         return false;
106106c3fb27SDimitry Andric 
106206c3fb27SDimitry Andric       if (!Cache->size()) {
106306c3fb27SDimitry Andric 
106406c3fb27SDimitry Andric         for (auto &PipeSG : SyncPipe) {
106506c3fb27SDimitry Andric           if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
106606c3fb27SDimitry Andric             OtherGroup = &PipeSG;
106706c3fb27SDimitry Andric           }
106806c3fb27SDimitry Andric         }
106906c3fb27SDimitry Andric 
107006c3fb27SDimitry Andric         if (!OtherGroup)
107106c3fb27SDimitry Andric           return false;
107206c3fb27SDimitry Andric         if (!OtherGroup->Collection.size())
107306c3fb27SDimitry Andric           return true;
107406c3fb27SDimitry Andric 
107506c3fb27SDimitry Andric         for (auto &OtherEle : OtherGroup->Collection) {
107606c3fb27SDimitry Andric           for (auto &Pred : OtherEle->Preds) {
107706c3fb27SDimitry Andric             if (Pred.getSUnit()->getInstr()->getOpcode() ==
107806c3fb27SDimitry Andric                 AMDGPU::V_PERM_B32_e64)
107906c3fb27SDimitry Andric               Cache->push_back(Pred.getSUnit());
108006c3fb27SDimitry Andric           }
108106c3fb27SDimitry Andric         }
10825f757f3fSDimitry Andric 
10835f757f3fSDimitry Andric         // If the other group has no PERM preds, then this group won't share any
10845f757f3fSDimitry Andric         if (!Cache->size())
10855f757f3fSDimitry Andric           return false;
108606c3fb27SDimitry Andric       }
108706c3fb27SDimitry Andric 
108806c3fb27SDimitry Andric       auto DAG = SyncPipe[0].DAG;
108906c3fb27SDimitry Andric       // Does the previous DS_WRITE share a V_PERM predecessor with this
109006c3fb27SDimitry Andric       // VMEM_READ
10915f757f3fSDimitry Andric       return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
109206c3fb27SDimitry Andric         return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
10935f757f3fSDimitry Andric       });
109406c3fb27SDimitry Andric     }
109506c3fb27SDimitry Andric     SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
109606c3fb27SDimitry Andric                                unsigned SGID, bool NeedsCache = false)
109706c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
109806c3fb27SDimitry Andric   };
109906c3fb27SDimitry Andric 
110006c3fb27SDimitry Andric public:
110106c3fb27SDimitry Andric   void applyIGLPStrategy(
110206c3fb27SDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
11035f757f3fSDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
11045f757f3fSDimitry Andric       bool IsReentry) override;
110506c3fb27SDimitry Andric 
110606c3fb27SDimitry Andric   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
110706c3fb27SDimitry Andric 
110806c3fb27SDimitry Andric   MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
110906c3fb27SDimitry Andric       : IGLPStrategy(DAG, TII) {
111006c3fb27SDimitry Andric     IsBottomUp = 0;
111106c3fb27SDimitry Andric   }
111206c3fb27SDimitry Andric };
111306c3fb27SDimitry Andric 
11145f757f3fSDimitry Andric static unsigned DSWCount = 0;
11155f757f3fSDimitry Andric static unsigned DSWWithPermCount = 0;
11165f757f3fSDimitry Andric static unsigned DSWWithSharedVMEMCount = 0;
11175f757f3fSDimitry Andric 
111806c3fb27SDimitry Andric void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
111906c3fb27SDimitry Andric     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
11205f757f3fSDimitry Andric     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
11215f757f3fSDimitry Andric     bool IsReentry) {
112206c3fb27SDimitry Andric   unsigned MFMACount = 0;
112306c3fb27SDimitry Andric   unsigned DSRCount = 0;
11245f757f3fSDimitry Andric 
11255f757f3fSDimitry Andric   assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 &&
11265f757f3fSDimitry Andric                         DSWWithSharedVMEMCount == 0)) &&
11275f757f3fSDimitry Andric          "DSWCounters should be zero in pre-RA scheduling!");
112806c3fb27SDimitry Andric   SmallVector<SUnit *, 6> DSWithPerms;
112906c3fb27SDimitry Andric   for (auto &SU : DAG->SUnits) {
113006c3fb27SDimitry Andric     auto I = SU.getInstr();
113106c3fb27SDimitry Andric     if (TII->isMFMAorWMMA(*I))
113206c3fb27SDimitry Andric       ++MFMACount;
113306c3fb27SDimitry Andric     else if (TII->isDS(*I)) {
113406c3fb27SDimitry Andric       if (I->mayLoad())
113506c3fb27SDimitry Andric         ++DSRCount;
11365f757f3fSDimitry Andric       else if (I->mayStore() && !IsReentry) {
113706c3fb27SDimitry Andric         ++DSWCount;
113806c3fb27SDimitry Andric         for (auto Pred : SU.Preds) {
113906c3fb27SDimitry Andric           if (Pred.getSUnit()->getInstr()->getOpcode() ==
114006c3fb27SDimitry Andric               AMDGPU::V_PERM_B32_e64) {
114106c3fb27SDimitry Andric             DSWithPerms.push_back(&SU);
114206c3fb27SDimitry Andric             break;
114306c3fb27SDimitry Andric           }
114406c3fb27SDimitry Andric         }
114506c3fb27SDimitry Andric       }
114606c3fb27SDimitry Andric     }
114706c3fb27SDimitry Andric   }
11485f757f3fSDimitry Andric 
11495f757f3fSDimitry Andric   if (!IsReentry) {
115006c3fb27SDimitry Andric     DSWWithPermCount = DSWithPerms.size();
115106c3fb27SDimitry Andric     auto I = DSWithPerms.begin();
115206c3fb27SDimitry Andric     auto E = DSWithPerms.end();
115306c3fb27SDimitry Andric 
115406c3fb27SDimitry Andric     // Get the count of DS_WRITES with V_PERM predecessors which
115506c3fb27SDimitry Andric     // have loop carried dependencies (WAR) on the same VMEM_READs.
115606c3fb27SDimitry Andric     // We consider partial overlap as a miss -- in other words,
115706c3fb27SDimitry Andric     // for a given DS_W, we only consider another DS_W as matching
115806c3fb27SDimitry Andric     // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
115906c3fb27SDimitry Andric     // for every V_PERM pred of this DS_W.
116006c3fb27SDimitry Andric     DenseMap<MachineInstr *, SUnit *> VMEMLookup;
116106c3fb27SDimitry Andric     SmallVector<SUnit *, 6> Counted;
116206c3fb27SDimitry Andric     for (; I != E; I++) {
116306c3fb27SDimitry Andric       SUnit *Cand = nullptr;
116406c3fb27SDimitry Andric       bool MissedAny = false;
116506c3fb27SDimitry Andric       for (auto &Pred : (*I)->Preds) {
116606c3fb27SDimitry Andric         if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
116706c3fb27SDimitry Andric           continue;
116806c3fb27SDimitry Andric 
11695f757f3fSDimitry Andric         if (Cand && llvm::is_contained(Counted, Cand))
117006c3fb27SDimitry Andric           break;
117106c3fb27SDimitry Andric 
117206c3fb27SDimitry Andric         for (auto &Succ : Pred.getSUnit()->Succs) {
117306c3fb27SDimitry Andric           auto MI = Succ.getSUnit()->getInstr();
117406c3fb27SDimitry Andric           if (!TII->isVMEM(*MI) || !MI->mayLoad())
117506c3fb27SDimitry Andric             continue;
117606c3fb27SDimitry Andric 
117706c3fb27SDimitry Andric           if (MissedAny || !VMEMLookup.size()) {
117806c3fb27SDimitry Andric             MissedAny = true;
117906c3fb27SDimitry Andric             VMEMLookup[MI] = *I;
118006c3fb27SDimitry Andric             continue;
118106c3fb27SDimitry Andric           }
118206c3fb27SDimitry Andric 
118306c3fb27SDimitry Andric           if (!VMEMLookup.contains(MI)) {
118406c3fb27SDimitry Andric             MissedAny = true;
118506c3fb27SDimitry Andric             VMEMLookup[MI] = *I;
118606c3fb27SDimitry Andric             continue;
118706c3fb27SDimitry Andric           }
118806c3fb27SDimitry Andric 
118906c3fb27SDimitry Andric           Cand = VMEMLookup[MI];
11905f757f3fSDimitry Andric           if (llvm::is_contained(Counted, Cand)) {
119106c3fb27SDimitry Andric             MissedAny = true;
119206c3fb27SDimitry Andric             break;
119306c3fb27SDimitry Andric           }
119406c3fb27SDimitry Andric         }
119506c3fb27SDimitry Andric       }
119606c3fb27SDimitry Andric       if (!MissedAny && Cand) {
119706c3fb27SDimitry Andric         DSWWithSharedVMEMCount += 2;
119806c3fb27SDimitry Andric         Counted.push_back(Cand);
119906c3fb27SDimitry Andric         Counted.push_back(*I);
120006c3fb27SDimitry Andric       }
120106c3fb27SDimitry Andric     }
12025f757f3fSDimitry Andric   }
120306c3fb27SDimitry Andric 
120406c3fb27SDimitry Andric   assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
120506c3fb27SDimitry Andric   SchedGroup *SG;
120606c3fb27SDimitry Andric   unsigned PipelineSyncID = 0;
120706c3fb27SDimitry Andric   // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
120806c3fb27SDimitry Andric   if (DSWWithPermCount) {
120906c3fb27SDimitry Andric     for (unsigned I = 0; I < MFMACount; I++) {
121006c3fb27SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
121106c3fb27SDimitry Andric           SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
121206c3fb27SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
121306c3fb27SDimitry Andric 
121406c3fb27SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
121506c3fb27SDimitry Andric           SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
121606c3fb27SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
121706c3fb27SDimitry Andric     }
121806c3fb27SDimitry Andric   }
121906c3fb27SDimitry Andric 
122006c3fb27SDimitry Andric   PipelineSyncID = 1;
122106c3fb27SDimitry Andric   // Phase 1: Break up DS_READ and MFMA clusters.
122206c3fb27SDimitry Andric   // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
122306c3fb27SDimitry Andric   // prefetch
122406c3fb27SDimitry Andric 
122506c3fb27SDimitry Andric   // Make ready initial MFMA
122606c3fb27SDimitry Andric   SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
122706c3fb27SDimitry Andric       SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
122806c3fb27SDimitry Andric   SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
122906c3fb27SDimitry Andric   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
123006c3fb27SDimitry Andric 
123106c3fb27SDimitry Andric   SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
123206c3fb27SDimitry Andric       SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
123306c3fb27SDimitry Andric   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
123406c3fb27SDimitry Andric 
123506c3fb27SDimitry Andric   // Interleave MFMA with DS_READ prefetch
123606c3fb27SDimitry Andric   for (unsigned I = 0; I < DSRCount - 4; ++I) {
123706c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
123806c3fb27SDimitry Andric         SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
123906c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
124006c3fb27SDimitry Andric 
124106c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
124206c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
124306c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
124406c3fb27SDimitry Andric   }
124506c3fb27SDimitry Andric 
124606c3fb27SDimitry Andric   // Phase 2a: Loop carried dependency with V_PERM
124706c3fb27SDimitry Andric   // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
124806c3fb27SDimitry Andric   // depend on. Interleave MFMA to keep XDL unit busy throughout.
124906c3fb27SDimitry Andric   for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
125006c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
125106c3fb27SDimitry Andric         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
125206c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
125306c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
125406c3fb27SDimitry Andric 
125506c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
125606c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
125706c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
125806c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
125906c3fb27SDimitry Andric 
126006c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
126106c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
126206c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
126306c3fb27SDimitry Andric         1, TII, SG->getSGID(), true));
126406c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
126506c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
126606c3fb27SDimitry Andric 
126706c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
126806c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
126906c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
127006c3fb27SDimitry Andric 
127106c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
127206c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
127306c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
127406c3fb27SDimitry Andric         3, TII, SG->getSGID(), true));
127506c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
127606c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
127706c3fb27SDimitry Andric 
127806c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
127906c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
128006c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
128106c3fb27SDimitry Andric   }
128206c3fb27SDimitry Andric 
128306c3fb27SDimitry Andric   // Phase 2b: Loop carried dependency without V_PERM
128406c3fb27SDimitry Andric   // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
128506c3fb27SDimitry Andric   // Interleave MFMA to keep XDL unit busy throughout.
128606c3fb27SDimitry Andric   for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
128706c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
128806c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
128906c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
129006c3fb27SDimitry Andric 
129106c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
129206c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
129306c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
129406c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
129506c3fb27SDimitry Andric 
129606c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
129706c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
129806c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
129906c3fb27SDimitry Andric   }
130006c3fb27SDimitry Andric 
130106c3fb27SDimitry Andric   // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
130206c3fb27SDimitry Andric   // ultimately used by two DS_WRITE
130306c3fb27SDimitry Andric   // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
130406c3fb27SDimitry Andric   // depend on. Interleave MFMA to keep XDL unit busy throughout.
130506c3fb27SDimitry Andric 
130606c3fb27SDimitry Andric   for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
130706c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
130806c3fb27SDimitry Andric         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
130906c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
131006c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
131106c3fb27SDimitry Andric 
131206c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
131306c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
131406c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
131506c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
131606c3fb27SDimitry Andric 
131706c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
131806c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
131906c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
132006c3fb27SDimitry Andric 
132106c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
132206c3fb27SDimitry Andric         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
132306c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
132406c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
132506c3fb27SDimitry Andric 
132606c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
132706c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
132806c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
132906c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
133006c3fb27SDimitry Andric 
133106c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
133206c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
133306c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
133406c3fb27SDimitry Andric 
133506c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
133606c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
133706c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
133806c3fb27SDimitry Andric         2, TII, SG->getSGID(), true));
133906c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
134006c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
134106c3fb27SDimitry Andric 
134206c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
134306c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
134406c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
134506c3fb27SDimitry Andric 
134606c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
134706c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
134806c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
134906c3fb27SDimitry Andric         4, TII, SG->getSGID(), true));
135006c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
135106c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
135206c3fb27SDimitry Andric 
135306c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
135406c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
135506c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
135606c3fb27SDimitry Andric   }
135706c3fb27SDimitry Andric }
135806c3fb27SDimitry Andric 
1359bdd1243dSDimitry Andric static std::unique_ptr<IGLPStrategy>
1360bdd1243dSDimitry Andric createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
1361bdd1243dSDimitry Andric                    const SIInstrInfo *TII) {
1362bdd1243dSDimitry Andric   switch (ID) {
1363bdd1243dSDimitry Andric   case MFMASmallGemmOptID:
1364bdd1243dSDimitry Andric     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
136506c3fb27SDimitry Andric   case MFMASmallGemmSingleWaveOptID:
136606c3fb27SDimitry Andric     return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
1367bdd1243dSDimitry Andric   }
1368bdd1243dSDimitry Andric 
1369bdd1243dSDimitry Andric   llvm_unreachable("Unknown IGLPStrategyID");
1370bdd1243dSDimitry Andric }
1371bdd1243dSDimitry Andric 
1372bdd1243dSDimitry Andric class IGroupLPDAGMutation : public ScheduleDAGMutation {
1373bdd1243dSDimitry Andric private:
1374bdd1243dSDimitry Andric   const SIInstrInfo *TII;
1375bdd1243dSDimitry Andric 
1376bdd1243dSDimitry Andric   ScheduleDAGMI *DAG;
1377bdd1243dSDimitry Andric 
1378bdd1243dSDimitry Andric   // Organize lists of SchedGroups by their SyncID. SchedGroups /
1379bdd1243dSDimitry Andric   // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
1380bdd1243dSDimitry Andric   // between then.
1381bdd1243dSDimitry Andric   DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
1382bdd1243dSDimitry Andric 
1383bdd1243dSDimitry Andric   // Used to track instructions that can be mapped to multiple sched groups
1384bdd1243dSDimitry Andric   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
1385bdd1243dSDimitry Andric 
1386bdd1243dSDimitry Andric   // Add DAG edges that enforce SCHED_BARRIER ordering.
1387bdd1243dSDimitry Andric   void addSchedBarrierEdges(SUnit &SU);
1388bdd1243dSDimitry Andric 
1389bdd1243dSDimitry Andric   // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
1390bdd1243dSDimitry Andric   // not be reordered accross the SCHED_BARRIER. This is used for the base
1391bdd1243dSDimitry Andric   // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
1392bdd1243dSDimitry Andric   // SCHED_BARRIER will always block all instructions that can be classified
1393bdd1243dSDimitry Andric   // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
1394bdd1243dSDimitry Andric   // and may only synchronize with some SchedGroups. Returns the inverse of
1395bdd1243dSDimitry Andric   // Mask. SCHED_BARRIER's mask describes which instruction types should be
1396bdd1243dSDimitry Andric   // allowed to be scheduled across it. Invert the mask to get the
1397bdd1243dSDimitry Andric   // SchedGroupMask of instructions that should be barred.
1398bdd1243dSDimitry Andric   SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
1399bdd1243dSDimitry Andric 
1400bdd1243dSDimitry Andric   // Create SchedGroups for a SCHED_GROUP_BARRIER.
1401bdd1243dSDimitry Andric   void initSchedGroupBarrierPipelineStage(
1402bdd1243dSDimitry Andric       std::vector<SUnit>::reverse_iterator RIter);
1403bdd1243dSDimitry Andric 
1404bdd1243dSDimitry Andric   void initIGLPOpt(SUnit &SU);
1405bdd1243dSDimitry Andric 
1406bdd1243dSDimitry Andric public:
1407bdd1243dSDimitry Andric   void apply(ScheduleDAGInstrs *DAGInstrs) override;
1408bdd1243dSDimitry Andric 
140906c3fb27SDimitry Andric   // The order in which the PipelineSolver should process the candidate
141006c3fb27SDimitry Andric   // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
141106c3fb27SDimitry Andric   // created SchedGroup first, and will consider that as the ultimate
141206c3fb27SDimitry Andric   // predecessor group when linking. TOP_DOWN instead links and processes the
141306c3fb27SDimitry Andric   // first created SchedGroup first.
141406c3fb27SDimitry Andric   bool IsBottomUp = 1;
141506c3fb27SDimitry Andric 
14165f757f3fSDimitry Andric   // Whether or not this is a reentry into the IGroupLPDAGMutation.
14175f757f3fSDimitry Andric   bool IsReentry = false;
14185f757f3fSDimitry Andric 
1419bdd1243dSDimitry Andric   IGroupLPDAGMutation() = default;
14205f757f3fSDimitry Andric   IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {}
1421bdd1243dSDimitry Andric };
1422bdd1243dSDimitry Andric 
1423bdd1243dSDimitry Andric unsigned SchedGroup::NumSchedGroups = 0;
1424bdd1243dSDimitry Andric 
1425bdd1243dSDimitry Andric bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
1426bdd1243dSDimitry Andric   if (A != B && DAG->canAddEdge(B, A)) {
1427bdd1243dSDimitry Andric     DAG->addEdge(B, SDep(A, SDep::Artificial));
1428bdd1243dSDimitry Andric     return true;
1429bdd1243dSDimitry Andric   }
1430bdd1243dSDimitry Andric   return false;
1431bdd1243dSDimitry Andric }
1432bdd1243dSDimitry Andric 
1433bdd1243dSDimitry Andric bool SchedGroup::canAddMI(const MachineInstr &MI) const {
1434bdd1243dSDimitry Andric   bool Result = false;
1435bdd1243dSDimitry Andric   if (MI.isMetaInstruction())
1436bdd1243dSDimitry Andric     Result = false;
1437bdd1243dSDimitry Andric 
1438bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
1439*cb14a3feSDimitry Andric            (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
1440*cb14a3feSDimitry Andric             TII->isTRANS(MI)))
1441bdd1243dSDimitry Andric     Result = true;
1442bdd1243dSDimitry Andric 
1443bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
1444*cb14a3feSDimitry Andric            TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
1445bdd1243dSDimitry Andric     Result = true;
1446bdd1243dSDimitry Andric 
1447bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
1448bdd1243dSDimitry Andric            TII->isSALU(MI))
1449bdd1243dSDimitry Andric     Result = true;
1450bdd1243dSDimitry Andric 
1451bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
1452bdd1243dSDimitry Andric            TII->isMFMAorWMMA(MI))
1453bdd1243dSDimitry Andric     Result = true;
1454bdd1243dSDimitry Andric 
1455bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
1456bdd1243dSDimitry Andric            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
1457bdd1243dSDimitry Andric     Result = true;
1458bdd1243dSDimitry Andric 
1459bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
1460bdd1243dSDimitry Andric            MI.mayLoad() &&
1461bdd1243dSDimitry Andric            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
1462bdd1243dSDimitry Andric     Result = true;
1463bdd1243dSDimitry Andric 
1464bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
1465bdd1243dSDimitry Andric            MI.mayStore() &&
1466bdd1243dSDimitry Andric            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
1467bdd1243dSDimitry Andric     Result = true;
1468bdd1243dSDimitry Andric 
1469bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
1470bdd1243dSDimitry Andric            TII->isDS(MI))
1471bdd1243dSDimitry Andric     Result = true;
1472bdd1243dSDimitry Andric 
1473bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
1474bdd1243dSDimitry Andric            MI.mayLoad() && TII->isDS(MI))
1475bdd1243dSDimitry Andric     Result = true;
1476bdd1243dSDimitry Andric 
1477bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
1478bdd1243dSDimitry Andric            MI.mayStore() && TII->isDS(MI))
1479bdd1243dSDimitry Andric     Result = true;
1480bdd1243dSDimitry Andric 
1481*cb14a3feSDimitry Andric   else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
1482*cb14a3feSDimitry Andric            TII->isTRANS(MI))
1483*cb14a3feSDimitry Andric     Result = true;
1484*cb14a3feSDimitry Andric 
1485bdd1243dSDimitry Andric   LLVM_DEBUG(
1486bdd1243dSDimitry Andric       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
1487bdd1243dSDimitry Andric              << (Result ? " could classify " : " unable to classify ") << MI);
1488bdd1243dSDimitry Andric 
1489bdd1243dSDimitry Andric   return Result;
1490bdd1243dSDimitry Andric }
1491bdd1243dSDimitry Andric 
1492bdd1243dSDimitry Andric int SchedGroup::link(SUnit &SU, bool MakePred,
1493bdd1243dSDimitry Andric                      std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
1494bdd1243dSDimitry Andric   int MissedEdges = 0;
1495bdd1243dSDimitry Andric   for (auto *A : Collection) {
1496bdd1243dSDimitry Andric     SUnit *B = &SU;
1497bdd1243dSDimitry Andric     if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
1498bdd1243dSDimitry Andric       continue;
1499bdd1243dSDimitry Andric     if (MakePred)
1500bdd1243dSDimitry Andric       std::swap(A, B);
1501bdd1243dSDimitry Andric 
1502bdd1243dSDimitry Andric     if (DAG->IsReachable(B, A))
1503bdd1243dSDimitry Andric       continue;
150406c3fb27SDimitry Andric 
1505bdd1243dSDimitry Andric     // tryAddEdge returns false if there is a dependency that makes adding
1506bdd1243dSDimitry Andric     // the A->B edge impossible, otherwise it returns true;
1507bdd1243dSDimitry Andric     bool Added = tryAddEdge(A, B);
1508bdd1243dSDimitry Andric     if (Added)
1509bdd1243dSDimitry Andric       AddedEdges.push_back(std::pair(A, B));
1510bdd1243dSDimitry Andric     else
1511bdd1243dSDimitry Andric       ++MissedEdges;
1512bdd1243dSDimitry Andric   }
1513bdd1243dSDimitry Andric 
1514bdd1243dSDimitry Andric   return MissedEdges;
1515bdd1243dSDimitry Andric }
1516bdd1243dSDimitry Andric 
1517bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU, bool MakePred) {
1518bdd1243dSDimitry Andric   for (auto *A : Collection) {
1519bdd1243dSDimitry Andric     SUnit *B = &SU;
1520bdd1243dSDimitry Andric     if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
1521bdd1243dSDimitry Andric       continue;
1522bdd1243dSDimitry Andric     if (MakePred)
1523bdd1243dSDimitry Andric       std::swap(A, B);
1524bdd1243dSDimitry Andric 
1525bdd1243dSDimitry Andric     tryAddEdge(A, B);
1526bdd1243dSDimitry Andric   }
1527bdd1243dSDimitry Andric }
1528bdd1243dSDimitry Andric 
1529bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU,
1530bdd1243dSDimitry Andric                       function_ref<bool(const SUnit *A, const SUnit *B)> P) {
1531bdd1243dSDimitry Andric   for (auto *A : Collection) {
1532bdd1243dSDimitry Andric     SUnit *B = &SU;
1533bdd1243dSDimitry Andric     if (P(A, B))
1534bdd1243dSDimitry Andric       std::swap(A, B);
1535bdd1243dSDimitry Andric 
1536bdd1243dSDimitry Andric     tryAddEdge(A, B);
1537bdd1243dSDimitry Andric   }
1538bdd1243dSDimitry Andric }
1539bdd1243dSDimitry Andric 
1540bdd1243dSDimitry Andric void SchedGroup::link(SchedGroup &OtherGroup) {
1541bdd1243dSDimitry Andric   for (auto *B : OtherGroup.Collection)
1542bdd1243dSDimitry Andric     link(*B);
1543bdd1243dSDimitry Andric }
1544bdd1243dSDimitry Andric 
1545bdd1243dSDimitry Andric bool SchedGroup::canAddSU(SUnit &SU) const {
1546bdd1243dSDimitry Andric   MachineInstr &MI = *SU.getInstr();
1547bdd1243dSDimitry Andric   if (MI.getOpcode() != TargetOpcode::BUNDLE)
1548bdd1243dSDimitry Andric     return canAddMI(MI);
1549bdd1243dSDimitry Andric 
1550bdd1243dSDimitry Andric   // Special case for bundled MIs.
1551bdd1243dSDimitry Andric   const MachineBasicBlock *MBB = MI.getParent();
1552bdd1243dSDimitry Andric   MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
1553bdd1243dSDimitry Andric   while (E != MBB->end() && E->isBundledWithPred())
1554bdd1243dSDimitry Andric     ++E;
1555bdd1243dSDimitry Andric 
1556bdd1243dSDimitry Andric   // Return true if all of the bundled MIs can be added to this group.
1557bdd1243dSDimitry Andric   return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
1558bdd1243dSDimitry Andric }
1559bdd1243dSDimitry Andric 
1560bdd1243dSDimitry Andric void SchedGroup::initSchedGroup() {
1561bdd1243dSDimitry Andric   for (auto &SU : DAG->SUnits) {
1562bdd1243dSDimitry Andric     if (isFull())
1563bdd1243dSDimitry Andric       break;
1564bdd1243dSDimitry Andric 
1565bdd1243dSDimitry Andric     if (canAddSU(SU))
1566bdd1243dSDimitry Andric       add(SU);
1567bdd1243dSDimitry Andric   }
1568bdd1243dSDimitry Andric }
1569bdd1243dSDimitry Andric 
1570bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
1571bdd1243dSDimitry Andric                                 SUnitsToCandidateSGsMap &SyncedInstrs) {
1572bdd1243dSDimitry Andric   SUnit &InitSU = *RIter;
1573bdd1243dSDimitry Andric   for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
1574bdd1243dSDimitry Andric     auto &SU = *RIter;
1575bdd1243dSDimitry Andric     if (isFull())
1576bdd1243dSDimitry Andric       break;
1577bdd1243dSDimitry Andric 
1578bdd1243dSDimitry Andric     if (canAddSU(SU))
1579bdd1243dSDimitry Andric       SyncedInstrs[&SU].push_back(SGID);
1580bdd1243dSDimitry Andric   }
1581bdd1243dSDimitry Andric 
1582bdd1243dSDimitry Andric   add(InitSU);
1583bdd1243dSDimitry Andric   assert(MaxSize);
1584bdd1243dSDimitry Andric   (*MaxSize)++;
1585bdd1243dSDimitry Andric }
1586bdd1243dSDimitry Andric 
1587bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
1588bdd1243dSDimitry Andric   auto I = DAG->SUnits.rbegin();
1589bdd1243dSDimitry Andric   auto E = DAG->SUnits.rend();
1590bdd1243dSDimitry Andric   for (; I != E; ++I) {
1591bdd1243dSDimitry Andric     auto &SU = *I;
1592bdd1243dSDimitry Andric     if (isFull())
1593bdd1243dSDimitry Andric       break;
1594bdd1243dSDimitry Andric 
1595bdd1243dSDimitry Andric     if (canAddSU(SU))
1596bdd1243dSDimitry Andric       SyncedInstrs[&SU].push_back(SGID);
1597bdd1243dSDimitry Andric   }
1598bdd1243dSDimitry Andric }
1599bdd1243dSDimitry Andric 
1600bdd1243dSDimitry Andric void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
160181ad6265SDimitry Andric   const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
160281ad6265SDimitry Andric   if (!TSchedModel || DAGInstrs->SUnits.empty())
160381ad6265SDimitry Andric     return;
160481ad6265SDimitry Andric 
1605bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
160681ad6265SDimitry Andric   const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
160781ad6265SDimitry Andric   TII = ST.getInstrInfo();
160881ad6265SDimitry Andric   DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
1609bdd1243dSDimitry Andric   SyncedSchedGroups.clear();
1610bdd1243dSDimitry Andric   SyncedInstrs.clear();
1611bdd1243dSDimitry Andric   bool foundSB = false;
1612bdd1243dSDimitry Andric   bool foundIGLP = false;
1613bdd1243dSDimitry Andric   for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
1614bdd1243dSDimitry Andric     unsigned Opc = R->getInstr()->getOpcode();
1615bdd1243dSDimitry Andric     // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
1616bdd1243dSDimitry Andric     if (Opc == AMDGPU::SCHED_BARRIER) {
1617bdd1243dSDimitry Andric       addSchedBarrierEdges(*R);
1618bdd1243dSDimitry Andric       foundSB = true;
1619bdd1243dSDimitry Andric     } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
1620bdd1243dSDimitry Andric       initSchedGroupBarrierPipelineStage(R);
1621bdd1243dSDimitry Andric       foundSB = true;
1622bdd1243dSDimitry Andric     } else if (Opc == AMDGPU::IGLP_OPT) {
1623bdd1243dSDimitry Andric       resetEdges(*R, DAG);
1624bdd1243dSDimitry Andric       if (!foundSB && !foundIGLP)
1625bdd1243dSDimitry Andric         initIGLPOpt(*R);
1626bdd1243dSDimitry Andric       foundIGLP = true;
1627bdd1243dSDimitry Andric     }
162881ad6265SDimitry Andric   }
162981ad6265SDimitry Andric 
1630bdd1243dSDimitry Andric   if (foundSB || foundIGLP) {
163106c3fb27SDimitry Andric     PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
1632bdd1243dSDimitry Andric     // PipelineSolver performs the mutation by adding the edges it
1633bdd1243dSDimitry Andric     // determined as the best
1634bdd1243dSDimitry Andric     PS.solve();
1635bdd1243dSDimitry Andric   }
1636bdd1243dSDimitry Andric }
1637bdd1243dSDimitry Andric 
1638bdd1243dSDimitry Andric void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
163981ad6265SDimitry Andric   MachineInstr &MI = *SchedBarrier.getInstr();
164081ad6265SDimitry Andric   assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
164181ad6265SDimitry Andric   // Remove all existing edges from the SCHED_BARRIER that were added due to the
164281ad6265SDimitry Andric   // instruction having side effects.
1643bdd1243dSDimitry Andric   resetEdges(SchedBarrier, DAG);
1644*cb14a3feSDimitry Andric   LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
1645*cb14a3feSDimitry Andric                     << MI.getOperand(0).getImm() << "\n");
1646bdd1243dSDimitry Andric   auto InvertedMask =
1647bdd1243dSDimitry Andric       invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
1648bdd1243dSDimitry Andric   SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
1649bdd1243dSDimitry Andric   SG.initSchedGroup();
1650*cb14a3feSDimitry Andric 
1651bdd1243dSDimitry Andric   // Preserve original instruction ordering relative to the SCHED_BARRIER.
1652bdd1243dSDimitry Andric   SG.link(
1653bdd1243dSDimitry Andric       SchedBarrier,
1654bdd1243dSDimitry Andric       (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
1655bdd1243dSDimitry Andric           const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
165681ad6265SDimitry Andric }
165781ad6265SDimitry Andric 
1658bdd1243dSDimitry Andric SchedGroupMask
1659bdd1243dSDimitry Andric IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
1660bdd1243dSDimitry Andric   // Invert mask and erase bits for types of instructions that are implied to be
1661bdd1243dSDimitry Andric   // allowed past the SCHED_BARRIER.
1662bdd1243dSDimitry Andric   SchedGroupMask InvertedMask = ~Mask;
1663bdd1243dSDimitry Andric 
1664*cb14a3feSDimitry Andric   // ALU implies VALU, SALU, MFMA, TRANS.
1665bdd1243dSDimitry Andric   if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
1666*cb14a3feSDimitry Andric     InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
1667*cb14a3feSDimitry Andric                     ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
1668*cb14a3feSDimitry Andric   // VALU, SALU, MFMA, TRANS implies ALU.
1669bdd1243dSDimitry Andric   else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
1670bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
1671*cb14a3feSDimitry Andric            (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
1672*cb14a3feSDimitry Andric            (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
1673bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::ALU;
1674bdd1243dSDimitry Andric 
1675bdd1243dSDimitry Andric   // VMEM implies VMEM_READ, VMEM_WRITE.
1676bdd1243dSDimitry Andric   if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
1677bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
1678bdd1243dSDimitry Andric   // VMEM_READ, VMEM_WRITE implies VMEM.
1679bdd1243dSDimitry Andric   else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
1680bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
1681bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::VMEM;
1682bdd1243dSDimitry Andric 
1683bdd1243dSDimitry Andric   // DS implies DS_READ, DS_WRITE.
1684bdd1243dSDimitry Andric   if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
1685bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
1686bdd1243dSDimitry Andric   // DS_READ, DS_WRITE implies DS.
1687bdd1243dSDimitry Andric   else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
1688bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
1689bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::DS;
1690bdd1243dSDimitry Andric 
1691*cb14a3feSDimitry Andric   LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
1692*cb14a3feSDimitry Andric                     << "\n");
1693*cb14a3feSDimitry Andric 
1694bdd1243dSDimitry Andric   return InvertedMask;
169581ad6265SDimitry Andric }
169681ad6265SDimitry Andric 
1697bdd1243dSDimitry Andric void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
1698bdd1243dSDimitry Andric     std::vector<SUnit>::reverse_iterator RIter) {
1699bdd1243dSDimitry Andric   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
1700bdd1243dSDimitry Andric   // to the instruction having side effects.
1701bdd1243dSDimitry Andric   resetEdges(*RIter, DAG);
1702bdd1243dSDimitry Andric   MachineInstr &SGB = *RIter->getInstr();
1703bdd1243dSDimitry Andric   assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
1704bdd1243dSDimitry Andric   int32_t SGMask = SGB.getOperand(0).getImm();
1705bdd1243dSDimitry Andric   int32_t Size = SGB.getOperand(1).getImm();
1706bdd1243dSDimitry Andric   int32_t SyncID = SGB.getOperand(2).getImm();
1707bdd1243dSDimitry Andric 
1708bdd1243dSDimitry Andric   auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
1709bdd1243dSDimitry Andric                                                     Size, SyncID, DAG, TII);
1710bdd1243dSDimitry Andric 
1711bdd1243dSDimitry Andric   SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
171281ad6265SDimitry Andric }
171381ad6265SDimitry Andric 
1714bdd1243dSDimitry Andric void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
1715bdd1243dSDimitry Andric   IGLPStrategyID StrategyID =
1716bdd1243dSDimitry Andric       (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
1717bdd1243dSDimitry Andric   auto S = createIGLPStrategy(StrategyID, DAG, TII);
171806c3fb27SDimitry Andric   if (S->shouldApplyStrategy(DAG)) {
171906c3fb27SDimitry Andric     IsBottomUp = S->IsBottomUp;
17205f757f3fSDimitry Andric     S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry);
172181ad6265SDimitry Andric   }
172206c3fb27SDimitry Andric }
172381ad6265SDimitry Andric 
172481ad6265SDimitry Andric } // namespace
172581ad6265SDimitry Andric 
172681ad6265SDimitry Andric namespace llvm {
172781ad6265SDimitry Andric 
17285f757f3fSDimitry Andric /// \p IsReentry specifies whether or not this is a reentry into the
17295f757f3fSDimitry Andric /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
17305f757f3fSDimitry Andric /// same scheduling region (e.g. pre and post-RA scheduling / multiple
17315f757f3fSDimitry Andric /// scheduling "phases"), we can reenter this mutation framework more than once
17325f757f3fSDimitry Andric /// for a given region.
17335f757f3fSDimitry Andric std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) {
17345f757f3fSDimitry Andric   return std::make_unique<IGroupLPDAGMutation>(IsReentry);
173581ad6265SDimitry Andric }
173681ad6265SDimitry Andric 
173781ad6265SDimitry Andric } // end namespace llvm
1738