xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
181ad6265SDimitry Andric //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP  ------------===//
281ad6265SDimitry Andric //
381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
681ad6265SDimitry Andric //
781ad6265SDimitry Andric //===----------------------------------------------------------------------===//
881ad6265SDimitry Andric //
981ad6265SDimitry Andric // \file This file defines a set of schedule DAG mutations that can be used to
1081ad6265SDimitry Andric // override default scheduler behavior to enforce specific scheduling patterns.
1181ad6265SDimitry Andric // They should be used in cases where runtime performance considerations, such
1281ad6265SDimitry Andric // as inter-wavefront interactions, mean that compile-time heuristics cannot
1381ad6265SDimitry Andric // predict the optimal instruction ordering, or in kernels where optimum
1481ad6265SDimitry Andric // instruction scheduling is important enough to warrant manual intervention.
1581ad6265SDimitry Andric //
1681ad6265SDimitry Andric //===----------------------------------------------------------------------===//
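//
// Illustrative sketch (see the mutations below for the exact behavior): the
// mutations in this file are driven by the SCHED_BARRIER, SCHED_GROUP_BARRIER
// and IGLP_OPT pseudo instructions, which kernel authors typically emit via
// the llvm.amdgcn.sched.barrier / llvm.amdgcn.sched.group.barrier /
// llvm.amdgcn.iglp.opt intrinsics. For example, assuming the mask encoding
// defined by SchedGroupMask below:
//
//   ; ask for a group of 2 DS reads, then a group of 1 MFMA (sync id 0)
//   call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
//   call void @llvm.amdgcn.sched.group.barrier(i32 8,   i32 1, i32 0)
//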
1781ad6265SDimitry Andric 
1881ad6265SDimitry Andric #include "AMDGPUIGroupLP.h"
1981ad6265SDimitry Andric #include "AMDGPUTargetMachine.h"
2081ad6265SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
2181ad6265SDimitry Andric #include "SIInstrInfo.h"
2281ad6265SDimitry Andric #include "SIMachineFunctionInfo.h"
2381ad6265SDimitry Andric #include "llvm/ADT/BitmaskEnum.h"
24bdd1243dSDimitry Andric #include "llvm/ADT/DenseMap.h"
2581ad6265SDimitry Andric #include "llvm/CodeGen/MachineScheduler.h"
2681ad6265SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h"
2781ad6265SDimitry Andric 
2881ad6265SDimitry Andric using namespace llvm;
2981ad6265SDimitry Andric 
30bdd1243dSDimitry Andric #define DEBUG_TYPE "igrouplp"
3181ad6265SDimitry Andric 
3281ad6265SDimitry Andric namespace {
3381ad6265SDimitry Andric 
34bdd1243dSDimitry Andric static cl::opt<bool> EnableExactSolver(
35bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver", cl::Hidden,
36bdd1243dSDimitry Andric     cl::desc("Whether to use the exponential time solver to fit "
37bdd1243dSDimitry Andric              "the instructions to the pipeline as closely as "
38bdd1243dSDimitry Andric              "possible."),
3981ad6265SDimitry Andric     cl::init(false));
4081ad6265SDimitry Andric 
41bdd1243dSDimitry Andric static cl::opt<unsigned> CutoffForExact(
42bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43bdd1243dSDimitry Andric     cl::desc("The maximum number of scheduling group conflicts "
44bdd1243dSDimitry Andric              "which we attempt to solve with the exponential time "
45bdd1243dSDimitry Andric              "exact solver. Problem sizes greater than this will "
46bdd1243dSDimitry Andric              "be solved by the less accurate greedy algorithm. Selecting "
47bdd1243dSDimitry Andric              "the solver by size is superseded by manually selecting "
48bdd1243dSDimitry Andric              "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));
4981ad6265SDimitry Andric 
50bdd1243dSDimitry Andric static cl::opt<uint64_t> MaxBranchesExplored(
51bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52bdd1243dSDimitry Andric     cl::desc("The number of branches that we are willing to explore with "
53bdd1243dSDimitry Andric              "the exact algorithm before giving up."));
5481ad6265SDimitry Andric 
55bdd1243dSDimitry Andric static cl::opt<bool> UseCostHeur(
56bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57bdd1243dSDimitry Andric     cl::desc("Whether to use the cost heuristic to make choices as we "
58bdd1243dSDimitry Andric              "traverse the search space using the exact solver. Defaulted "
59bdd1243dSDimitry Andric              "to on, and if turned off, we will use the node order -- "
60bdd1243dSDimitry Andric              "attempting to put the later nodes in the later sched groups. "
61bdd1243dSDimitry Andric              "Experimentally, results are mixed, so this should be set on a "
62bdd1243dSDimitry Andric              "case-by-case basis."));
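// Illustrative usage of the options above (a sketch; the exact driver spelling
// may vary). They are ordinary LLVM cl::opts, so they can be passed to llc
// directly or to clang via -mllvm, e.g.
//   llc -mtriple=amdgcn ... -amdgpu-igrouplp-exact-solver-cutoff=16
//   clang ... -mllvm -amdgpu-igrouplp-exact-solver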
6381ad6265SDimitry Andric 
64bdd1243dSDimitry Andric // Components of the mask that determines which instruction types may be
65bdd1243dSDimitry Andric // classified into a SchedGroup.
66bdd1243dSDimitry Andric enum class SchedGroupMask {
6781ad6265SDimitry Andric   NONE = 0u,
6881ad6265SDimitry Andric   ALU = 1u << 0,
6981ad6265SDimitry Andric   VALU = 1u << 1,
7081ad6265SDimitry Andric   SALU = 1u << 2,
7181ad6265SDimitry Andric   MFMA = 1u << 3,
7281ad6265SDimitry Andric   VMEM = 1u << 4,
7381ad6265SDimitry Andric   VMEM_READ = 1u << 5,
7481ad6265SDimitry Andric   VMEM_WRITE = 1u << 6,
7581ad6265SDimitry Andric   DS = 1u << 7,
7681ad6265SDimitry Andric   DS_READ = 1u << 8,
7781ad6265SDimitry Andric   DS_WRITE = 1u << 9,
78cb14a3feSDimitry Andric   TRANS = 1u << 10,
79bdd1243dSDimitry Andric   ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
80cb14a3feSDimitry Andric         DS_READ | DS_WRITE | TRANS,
81bdd1243dSDimitry Andric   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
8281ad6265SDimitry Andric };
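// For example (values follow directly from the enum above): a SchedGroup that
// accepts both DS reads and DS writes uses DS_READ | DS_WRITE = 0x300, while a
// group restricted to MFMAs uses MFMA = 0x8. ALL is simply the union of every
// specific bit.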
8381ad6265SDimitry Andric 
8406c3fb27SDimitry Andric class SchedGroup;
8506c3fb27SDimitry Andric 
8606c3fb27SDimitry Andric // The InstructionRule class is used to apply a filter which determines whether
8706c3fb27SDimitry Andric // or not an SU maps to a given SchedGroup. It contains complementary data
8806c3fb27SDimitry Andric // structures (e.g. the Cache) to help those filters.
8906c3fb27SDimitry Andric class InstructionRule {
9006c3fb27SDimitry Andric protected:
9106c3fb27SDimitry Andric   const SIInstrInfo *TII;
9206c3fb27SDimitry Andric   unsigned SGID;
9306c3fb27SDimitry Andric   // A cache made available to the Filter to store SUnits for subsequent
9406c3fb27SDimitry Andric   // invocations of the Filter
9506c3fb27SDimitry Andric   std::optional<SmallVector<SUnit *, 4>> Cache;
9606c3fb27SDimitry Andric 
9706c3fb27SDimitry Andric public:
9806c3fb27SDimitry Andric   virtual bool
9906c3fb27SDimitry Andric   apply(const SUnit *, const ArrayRef<SUnit *>,
10006c3fb27SDimitry Andric         SmallVectorImpl<SchedGroup> &) {
10106c3fb27SDimitry Andric     return true;
10206c3fb27SDimitry Andric   };
10306c3fb27SDimitry Andric 
10406c3fb27SDimitry Andric   InstructionRule(const SIInstrInfo *TII, unsigned SGID,
10506c3fb27SDimitry Andric                   bool NeedsCache = false)
10606c3fb27SDimitry Andric       : TII(TII), SGID(SGID) {
10706c3fb27SDimitry Andric     if (NeedsCache) {
10806c3fb27SDimitry Andric       Cache = SmallVector<SUnit *, 4>();
10906c3fb27SDimitry Andric     }
11006c3fb27SDimitry Andric   }
11106c3fb27SDimitry Andric 
11206c3fb27SDimitry Andric   virtual ~InstructionRule() = default;
11306c3fb27SDimitry Andric };
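// Concrete rules subclass InstructionRule and override apply(); see e.g.
// IsPipeExp further below, which checks whether an SU transitively reaches an
// MFMA. Rules are attached to a SchedGroup via SchedGroup::addRule() and are
// evaluated through SchedGroup::allowedByRules() while the PipelineSolver fits
// instructions.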
11406c3fb27SDimitry Andric 
115*0fca6ea1SDimitry Andric using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;
11681ad6265SDimitry Andric 
117bdd1243dSDimitry Andric // Classify instructions into groups to enable fine tuned control over the
118bdd1243dSDimitry Andric // scheduler. These groups may be more specific than current SchedModel
119bdd1243dSDimitry Andric // instruction classes.
120bdd1243dSDimitry Andric class SchedGroup {
121bdd1243dSDimitry Andric private:
122bdd1243dSDimitry Andric   // Mask that defines which instruction types can be classified into this
123bdd1243dSDimitry Andric   // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
124bdd1243dSDimitry Andric   // and SCHED_GROUP_BARRIER.
125bdd1243dSDimitry Andric   SchedGroupMask SGMask;
12681ad6265SDimitry Andric 
127bdd1243dSDimitry Andric   // Maximum number of SUnits that can be added to this group.
128bdd1243dSDimitry Andric   std::optional<unsigned> MaxSize;
12981ad6265SDimitry Andric 
130bdd1243dSDimitry Andric   // SchedGroups will only synchronize with other SchedGroups that have the same
131bdd1243dSDimitry Andric   // SyncID.
132bdd1243dSDimitry Andric   int SyncID = 0;
13381ad6265SDimitry Andric 
134bdd1243dSDimitry Andric   // SGID is used to map instructions to candidate SchedGroups
135bdd1243dSDimitry Andric   unsigned SGID;
136bdd1243dSDimitry Andric 
13706c3fb27SDimitry Andric   // The different rules each instruction in this SchedGroup must conform to
13806c3fb27SDimitry Andric   SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;
13906c3fb27SDimitry Andric 
140bdd1243dSDimitry Andric   // Count of the number of created SchedGroups, used to initialize SGID.
141bdd1243dSDimitry Andric   static unsigned NumSchedGroups;
142bdd1243dSDimitry Andric 
143bdd1243dSDimitry Andric   // Try to add an edge from SU A to SU B.
144bdd1243dSDimitry Andric   bool tryAddEdge(SUnit *A, SUnit *B);
145bdd1243dSDimitry Andric 
146bdd1243dSDimitry Andric   // Use SGMask to determine whether we can classify MI as a member of this
147bdd1243dSDimitry Andric   // SchedGroup object.
148bdd1243dSDimitry Andric   bool canAddMI(const MachineInstr &MI) const;
14981ad6265SDimitry Andric 
15081ad6265SDimitry Andric public:
151bdd1243dSDimitry Andric   // Collection of SUnits that are classified as members of this group.
152bdd1243dSDimitry Andric   SmallVector<SUnit *, 32> Collection;
15381ad6265SDimitry Andric 
15406c3fb27SDimitry Andric   ScheduleDAGInstrs *DAG;
155*0fca6ea1SDimitry Andric   const SIInstrInfo *TII;
15606c3fb27SDimitry Andric 
157bdd1243dSDimitry Andric   // Returns true if SU can be added to this SchedGroup.
158bdd1243dSDimitry Andric   bool canAddSU(SUnit &SU) const;
15981ad6265SDimitry Andric 
160bdd1243dSDimitry Andric   // Add DAG dependencies between all SUnits in this SchedGroup and this SU. If
161bdd1243dSDimitry Andric   // MakePred is true, SU will be a predecessor of the SUnits in this
162bdd1243dSDimitry Andric   // SchedGroup, otherwise SU will be a successor.
163bdd1243dSDimitry Andric   void link(SUnit &SU, bool MakePred = false);
16481ad6265SDimitry Andric 
165bdd1243dSDimitry Andric   // Add DAG dependencies, track which edges are added, and return the count
166bdd1243dSDimitry Andric   // of missed edges.
167bdd1243dSDimitry Andric   int link(SUnit &SU, bool MakePred,
168bdd1243dSDimitry Andric            std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
16981ad6265SDimitry Andric 
170bdd1243dSDimitry Andric   // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
171bdd1243dSDimitry Andric   // Use the predicate to determine whether SU should be a predecessor (P =
172bdd1243dSDimitry Andric   // true) or a successor (P = false) of this SchedGroup.
173bdd1243dSDimitry Andric   void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
17481ad6265SDimitry Andric 
175bdd1243dSDimitry Andric   // Add DAG dependencies such that SUnits in this group shall be ordered
176bdd1243dSDimitry Andric   // before SUnits in OtherGroup.
177bdd1243dSDimitry Andric   void link(SchedGroup &OtherGroup);
178bdd1243dSDimitry Andric 
179bdd1243dSDimitry Andric   // Returns true if no more instructions may be added to this group.
180bdd1243dSDimitry Andric   bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
181bdd1243dSDimitry Andric 
18206c3fb27SDimitry Andric   // Append a constraint that SUs must meet in order to fit into this
18306c3fb27SDimitry Andric   // SchedGroup. Since many rules involve the relationship between a SchedGroup
18406c3fb27SDimitry Andric   // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
18506c3fb27SDimitry Andric   // time (rather than SchedGroup init time).
18606c3fb27SDimitry Andric   void addRule(std::shared_ptr<InstructionRule> NewRule) {
18706c3fb27SDimitry Andric     Rules.push_back(NewRule);
18806c3fb27SDimitry Andric   }
18906c3fb27SDimitry Andric 
19006c3fb27SDimitry Andric   // Returns true if the SU matches all rules
19106c3fb27SDimitry Andric   bool allowedByRules(const SUnit *SU,
19206c3fb27SDimitry Andric                       SmallVectorImpl<SchedGroup> &SyncPipe) const {
193*0fca6ea1SDimitry Andric     for (auto &Rule : Rules) {
194*0fca6ea1SDimitry Andric       if (!Rule.get()->apply(SU, Collection, SyncPipe))
19506c3fb27SDimitry Andric         return false;
19606c3fb27SDimitry Andric     }
19706c3fb27SDimitry Andric     return true;
19806c3fb27SDimitry Andric   }
19906c3fb27SDimitry Andric 
200bdd1243dSDimitry Andric   // Add SU to the SchedGroup.
201bdd1243dSDimitry Andric   void add(SUnit &SU) {
202bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
203bdd1243dSDimitry Andric                       << format_hex((int)SGMask, 10, true) << " adding "
204bdd1243dSDimitry Andric                       << *SU.getInstr());
205bdd1243dSDimitry Andric     Collection.push_back(&SU);
20681ad6265SDimitry Andric   }
20781ad6265SDimitry Andric 
208bdd1243dSDimitry Andric   // Remove the last element in the SchedGroup.
209bdd1243dSDimitry Andric   void pop() { Collection.pop_back(); }
210bdd1243dSDimitry Andric 
211bdd1243dSDimitry Andric   // Identify and add all relevant SUs from the DAG to this SchedGroup.
212bdd1243dSDimitry Andric   void initSchedGroup();
213bdd1243dSDimitry Andric 
214bdd1243dSDimitry Andric   // Add instructions to the SchedGroup bottom up starting from RIter.
215bdd1243dSDimitry Andric   // PipelineInstrs is a set of instructions that should not be added to the
216bdd1243dSDimitry Andric   // SchedGroup even when the other conditions for adding it are satisfied.
217bdd1243dSDimitry Andric   // RIter will be added to the SchedGroup as well, and dependencies will be
218bdd1243dSDimitry Andric   // added so that RIter will always be scheduled at the end of the group.
219bdd1243dSDimitry Andric   void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
220bdd1243dSDimitry Andric                       SUnitsToCandidateSGsMap &SyncedInstrs);
221bdd1243dSDimitry Andric 
222bdd1243dSDimitry Andric   void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
223bdd1243dSDimitry Andric 
224bdd1243dSDimitry Andric   int getSyncID() { return SyncID; }
225bdd1243dSDimitry Andric 
226bdd1243dSDimitry Andric   int getSGID() { return SGID; }
227bdd1243dSDimitry Andric 
228bdd1243dSDimitry Andric   SchedGroupMask getMask() { return SGMask; }
229bdd1243dSDimitry Andric 
230bdd1243dSDimitry Andric   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
231bdd1243dSDimitry Andric              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
232*0fca6ea1SDimitry Andric       : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
233bdd1243dSDimitry Andric     SGID = NumSchedGroups++;
234bdd1243dSDimitry Andric   }
235bdd1243dSDimitry Andric 
236bdd1243dSDimitry Andric   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
237bdd1243dSDimitry Andric              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
238*0fca6ea1SDimitry Andric       : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
239bdd1243dSDimitry Andric     SGID = NumSchedGroups++;
240bdd1243dSDimitry Andric   }
241bdd1243dSDimitry Andric };
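// A sketch of how these pieces fit together (see MFMASmallGemmOpt below for a
// real instance): a strategy builds an ordered list of SchedGroups per sync
// ID, e.g. {DS x2, MFMA x1} repeated, calls initSchedGroup() on each to
// collect candidate SUnits, and the PipelineSolver then assigns any SUnit that
// matched more than one group and adds the corresponding ordering edges.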
242bdd1243dSDimitry Andric 
243bdd1243dSDimitry Andric // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
244bdd1243dSDimitry Andric static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
245bdd1243dSDimitry Andric   assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
246bdd1243dSDimitry Andric          SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
247bdd1243dSDimitry Andric          SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
248bdd1243dSDimitry Andric 
249bdd1243dSDimitry Andric   while (!SU.Preds.empty())
250bdd1243dSDimitry Andric     for (auto &P : SU.Preds)
251bdd1243dSDimitry Andric       SU.removePred(P);
252bdd1243dSDimitry Andric 
253bdd1243dSDimitry Andric   while (!SU.Succs.empty())
254bdd1243dSDimitry Andric     for (auto &S : SU.Succs)
255bdd1243dSDimitry Andric       for (auto &SP : S.getSUnit()->Preds)
256bdd1243dSDimitry Andric         if (SP.getSUnit() == &SU)
257bdd1243dSDimitry Andric           S.getSUnit()->removePred(SP);
258bdd1243dSDimitry Andric }
259bdd1243dSDimitry Andric 
260*0fca6ea1SDimitry Andric using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
261*0fca6ea1SDimitry Andric using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
262bdd1243dSDimitry Andric 
263bdd1243dSDimitry Andric // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
264bdd1243dSDimitry Andric // in non-trivial cases. For example, if the requested pipeline is
265bdd1243dSDimitry Andric // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
266bdd1243dSDimitry Andric // in the DAG, then we will have an instruction that cannot be trivially
267bdd1243dSDimitry Andric // assigned to a SchedGroup. The PipelineSolver class implements two algorithms
268bdd1243dSDimitry Andric // to find a good solution to the pipeline -- a greedy algorithm and an exact
269bdd1243dSDimitry Andric // algorithm. The exact algorithm has an exponential time complexity and should
270bdd1243dSDimitry Andric // only be used for small or medium sized problems where an exact
271bdd1243dSDimitry Andric // solution is highly desired.
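// Cost model used below (restating the code for clarity): each pipeline edge
// that cannot be added while fitting an SU into a SchedGroup costs 1, and
// leaving a conflicted SU entirely unassigned costs MissPenalty
// (ProblemSize / 2 + 1, see solve()). The exact solver backtracks over the
// candidate groups of every conflicted SU and keeps the lowest-cost
// assignment; the greedy solver commits to the cheapest group per SU in a
// single pass.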
272bdd1243dSDimitry Andric class PipelineSolver {
273bdd1243dSDimitry Andric   ScheduleDAGMI *DAG;
274bdd1243dSDimitry Andric 
275bdd1243dSDimitry Andric   // Instructions that can be assigned to multiple SchedGroups
276bdd1243dSDimitry Andric   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
277bdd1243dSDimitry Andric   SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
278bdd1243dSDimitry Andric   DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
279bdd1243dSDimitry Andric   // The current working pipeline
280bdd1243dSDimitry Andric   SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
281bdd1243dSDimitry Andric   // The pipeline that has the best solution found so far
282bdd1243dSDimitry Andric   SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
283bdd1243dSDimitry Andric 
284bdd1243dSDimitry Andric   // Whether or not we actually have any SyncedInstrs to try to solve.
285bdd1243dSDimitry Andric   bool NeedsSolver = false;
286bdd1243dSDimitry Andric 
287bdd1243dSDimitry Andric   // Compute an estimate of the size of the search tree -- the true size is
288bdd1243dSDimitry Andric   // the product of each conflictedInst.Matches.size() across all SyncPipelines
289bdd1243dSDimitry Andric   unsigned computeProblemSize();
290bdd1243dSDimitry Andric 
291bdd1243dSDimitry Andric   // The cost penalty of not assigning a SU to a SchedGroup
292bdd1243dSDimitry Andric   int MissPenalty = 0;
293bdd1243dSDimitry Andric 
294bdd1243dSDimitry Andric   // Costs in terms of the number of edges we are unable to add
295bdd1243dSDimitry Andric   int BestCost = -1;
296bdd1243dSDimitry Andric   int CurrCost = 0;
297bdd1243dSDimitry Andric 
298bdd1243dSDimitry Andric   // Index pointing to the conflicting instruction that is currently being
299bdd1243dSDimitry Andric   // fitted
300bdd1243dSDimitry Andric   int CurrConflInstNo = 0;
301bdd1243dSDimitry Andric   // Index to the pipeline that is currently being fitted
302bdd1243dSDimitry Andric   int CurrSyncGroupIdx = 0;
303bdd1243dSDimitry Andric   // The first non-trivial pipeline
304bdd1243dSDimitry Andric   int BeginSyncGroupIdx = 0;
305bdd1243dSDimitry Andric 
306bdd1243dSDimitry Andric   // How many branches we have explored
307bdd1243dSDimitry Andric   uint64_t BranchesExplored = 0;
308bdd1243dSDimitry Andric 
30906c3fb27SDimitry Andric   // The direction in which we process the candidate SchedGroups per SU
310*0fca6ea1SDimitry Andric   bool IsBottomUp = true;
31106c3fb27SDimitry Andric 
312bdd1243dSDimitry Andric   // Update indices to fit next conflicting instruction
313bdd1243dSDimitry Andric   void advancePosition();
314bdd1243dSDimitry Andric   // Recede indices to attempt to find better fit for previous conflicting
315bdd1243dSDimitry Andric   // instruction
316bdd1243dSDimitry Andric   void retreatPosition();
317bdd1243dSDimitry Andric 
318bdd1243dSDimitry Andric   // The exponential time algorithm which finds the provably best fit
319bdd1243dSDimitry Andric   bool solveExact();
320bdd1243dSDimitry Andric   // The polynomial time algorithm which attempts to find a good fit
321bdd1243dSDimitry Andric   bool solveGreedy();
32206c3fb27SDimitry Andric   // Find the best SchedGroup for the current SU using the heuristic given all
32306c3fb27SDimitry Andric   // current information. One step in the greedy algorithm. Templated against
32406c3fb27SDimitry Andric   // the SchedGroup iterator (either reverse or forward).
32506c3fb27SDimitry Andric   template <typename T>
32606c3fb27SDimitry Andric   void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
32706c3fb27SDimitry Andric                   T E);
328bdd1243dSDimitry Andric   // Whether or not the current solution is optimal
329bdd1243dSDimitry Andric   bool checkOptimal();
330bdd1243dSDimitry Andric   // Populate the ready list, prioritizing fewest missed edges first.
33106c3fb27SDimitry Andric   // Templated against the SchedGroup iterator (either reverse or forward).
33206c3fb27SDimitry Andric   template <typename T>
33306c3fb27SDimitry Andric   void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
33406c3fb27SDimitry Andric                          T E);
335bdd1243dSDimitry Andric   // Add edges corresponding to the SchedGroups as assigned by solver
336bdd1243dSDimitry Andric   void makePipeline();
33706c3fb27SDimitry Andric   // Link the SchedGroups in the best found pipeline.
33806c3fb27SDimitry Andric   // Templated against the SchedGroup iterator (either reverse or forward).
33906c3fb27SDimitry Andric   template <typename T> void linkSchedGroups(T I, T E);
340bdd1243dSDimitry Andric   // Add the edges from the SU to the other SchedGroups in pipeline, and
341bdd1243dSDimitry Andric   // return the number of edges missed.
342bdd1243dSDimitry Andric   int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
343bdd1243dSDimitry Andric                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
3445f757f3fSDimitry Andric   /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
3455f757f3fSDimitry Andric   /// returns the cost (in terms of missed pipeline edges), and tracks the edges
3465f757f3fSDimitry Andric   /// added in \p AddedEdges
34706c3fb27SDimitry Andric   template <typename T>
34806c3fb27SDimitry Andric   int linkSUnit(SUnit *SU, int SGID,
34906c3fb27SDimitry Andric                 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
3505f757f3fSDimitry Andric   /// Remove the edges passed via \p AddedEdges
351bdd1243dSDimitry Andric   void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
352bdd1243dSDimitry Andric   // Convert the passed in maps to arrays for bidirectional iterators
353bdd1243dSDimitry Andric   void convertSyncMapsToArrays();
354bdd1243dSDimitry Andric 
355bdd1243dSDimitry Andric   void reset();
356bdd1243dSDimitry Andric 
357bdd1243dSDimitry Andric public:
358bdd1243dSDimitry Andric   // Invoke the solver to map instructions to instruction groups. A heuristic and
359bdd1243dSDimitry Andric   // the command-line options determine whether to use the exact or greedy algorithm.
360bdd1243dSDimitry Andric   void solve();
361bdd1243dSDimitry Andric 
362bdd1243dSDimitry Andric   PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
363bdd1243dSDimitry Andric                  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
364*0fca6ea1SDimitry Andric                  ScheduleDAGMI *DAG, bool IsBottomUp = true)
365bdd1243dSDimitry Andric       : DAG(DAG), SyncedInstrs(SyncedInstrs),
36606c3fb27SDimitry Andric         SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
367bdd1243dSDimitry Andric 
368bdd1243dSDimitry Andric     for (auto &PipelineInstrs : SyncedInstrs) {
369bdd1243dSDimitry Andric       if (PipelineInstrs.second.size() > 0) {
370bdd1243dSDimitry Andric         NeedsSolver = true;
371bdd1243dSDimitry Andric         break;
372bdd1243dSDimitry Andric       }
373bdd1243dSDimitry Andric     }
374bdd1243dSDimitry Andric 
375bdd1243dSDimitry Andric     if (!NeedsSolver)
376bdd1243dSDimitry Andric       return;
377bdd1243dSDimitry Andric 
378bdd1243dSDimitry Andric     convertSyncMapsToArrays();
379bdd1243dSDimitry Andric 
380bdd1243dSDimitry Andric     CurrPipeline = BestPipeline;
381bdd1243dSDimitry Andric 
382bdd1243dSDimitry Andric     while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
383bdd1243dSDimitry Andric            PipelineInstrs[BeginSyncGroupIdx].size() == 0)
384bdd1243dSDimitry Andric       ++BeginSyncGroupIdx;
385bdd1243dSDimitry Andric 
386bdd1243dSDimitry Andric     if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
387bdd1243dSDimitry Andric       return;
388bdd1243dSDimitry Andric   }
389bdd1243dSDimitry Andric };
390bdd1243dSDimitry Andric 
391bdd1243dSDimitry Andric void PipelineSolver::reset() {
392bdd1243dSDimitry Andric 
393bdd1243dSDimitry Andric   for (auto &SyncPipeline : CurrPipeline) {
394bdd1243dSDimitry Andric     for (auto &SG : SyncPipeline) {
395bdd1243dSDimitry Andric       SmallVector<SUnit *, 32> TempCollection = SG.Collection;
396bdd1243dSDimitry Andric       SG.Collection.clear();
397bdd1243dSDimitry Andric       auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
398bdd1243dSDimitry Andric         return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
399bdd1243dSDimitry Andric       });
400bdd1243dSDimitry Andric       if (SchedBarr != TempCollection.end())
401bdd1243dSDimitry Andric         SG.Collection.push_back(*SchedBarr);
402bdd1243dSDimitry Andric     }
403bdd1243dSDimitry Andric   }
404bdd1243dSDimitry Andric 
405bdd1243dSDimitry Andric   CurrSyncGroupIdx = BeginSyncGroupIdx;
406bdd1243dSDimitry Andric   CurrConflInstNo = 0;
407bdd1243dSDimitry Andric   CurrCost = 0;
408bdd1243dSDimitry Andric }
409bdd1243dSDimitry Andric 
410bdd1243dSDimitry Andric void PipelineSolver::convertSyncMapsToArrays() {
411bdd1243dSDimitry Andric   for (auto &SyncPipe : SyncedSchedGroups) {
412bdd1243dSDimitry Andric     BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
413bdd1243dSDimitry Andric   }
414bdd1243dSDimitry Andric 
415bdd1243dSDimitry Andric   int PipelineIDx = SyncedInstrs.size() - 1;
416bdd1243dSDimitry Andric   PipelineInstrs.resize(SyncedInstrs.size());
417bdd1243dSDimitry Andric   for (auto &SyncInstrMap : SyncedInstrs) {
418bdd1243dSDimitry Andric     for (auto &SUsToCandSGs : SyncInstrMap.second) {
419bdd1243dSDimitry Andric       if (PipelineInstrs[PipelineIDx].size() == 0) {
420bdd1243dSDimitry Andric         PipelineInstrs[PipelineIDx].push_back(
421bdd1243dSDimitry Andric             std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
422bdd1243dSDimitry Andric         continue;
423bdd1243dSDimitry Andric       }
424bdd1243dSDimitry Andric       auto SortPosition = PipelineInstrs[PipelineIDx].begin();
425bdd1243dSDimitry Andric       // Insert them in sorted order -- this allows for good parsing order in
426bdd1243dSDimitry Andric       // the greedy algorithm
427bdd1243dSDimitry Andric       while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
428bdd1243dSDimitry Andric              SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
429bdd1243dSDimitry Andric         ++SortPosition;
430bdd1243dSDimitry Andric       PipelineInstrs[PipelineIDx].insert(
431bdd1243dSDimitry Andric           SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
432bdd1243dSDimitry Andric     }
433bdd1243dSDimitry Andric     --PipelineIDx;
434bdd1243dSDimitry Andric   }
435bdd1243dSDimitry Andric }
436bdd1243dSDimitry Andric 
43706c3fb27SDimitry Andric template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
43806c3fb27SDimitry Andric   for (; I != E; ++I) {
43906c3fb27SDimitry Andric     auto &GroupA = *I;
44006c3fb27SDimitry Andric     for (auto J = std::next(I); J != E; ++J) {
44106c3fb27SDimitry Andric       auto &GroupB = *J;
44206c3fb27SDimitry Andric       GroupA.link(GroupB);
44306c3fb27SDimitry Andric     }
44406c3fb27SDimitry Andric   }
44506c3fb27SDimitry Andric }
44606c3fb27SDimitry Andric 
447bdd1243dSDimitry Andric void PipelineSolver::makePipeline() {
448bdd1243dSDimitry Andric   // Preserve the order of barriers for subsequent SchedGroupBarrier mutations.
449bdd1243dSDimitry Andric   for (auto &SyncPipeline : BestPipeline) {
45006c3fb27SDimitry Andric     LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
451bdd1243dSDimitry Andric     for (auto &SG : SyncPipeline) {
45206c3fb27SDimitry Andric       LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
45306c3fb27SDimitry Andric                         << " has: \n");
454bdd1243dSDimitry Andric       SUnit *SGBarr = nullptr;
455bdd1243dSDimitry Andric       for (auto &SU : SG.Collection) {
456bdd1243dSDimitry Andric         if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
457bdd1243dSDimitry Andric           SGBarr = SU;
45806c3fb27SDimitry Andric         LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
459bdd1243dSDimitry Andric       }
460bdd1243dSDimitry Andric       // Command-line-requested IGroupLP doesn't have an SGBarr.
461bdd1243dSDimitry Andric       if (!SGBarr)
462bdd1243dSDimitry Andric         continue;
463bdd1243dSDimitry Andric       resetEdges(*SGBarr, DAG);
464bdd1243dSDimitry Andric       SG.link(*SGBarr, false);
465bdd1243dSDimitry Andric     }
466bdd1243dSDimitry Andric   }
467bdd1243dSDimitry Andric 
468bdd1243dSDimitry Andric   for (auto &SyncPipeline : BestPipeline) {
46906c3fb27SDimitry Andric     IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
47006c3fb27SDimitry Andric                : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
47181ad6265SDimitry Andric   }
47281ad6265SDimitry Andric }
47306c3fb27SDimitry Andric 
47406c3fb27SDimitry Andric template <typename T>
47506c3fb27SDimitry Andric int PipelineSolver::linkSUnit(
47606c3fb27SDimitry Andric     SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
47706c3fb27SDimitry Andric     T I, T E) {
47806c3fb27SDimitry Andric   bool MakePred = false;
47906c3fb27SDimitry Andric   int AddedCost = 0;
48006c3fb27SDimitry Andric   for (; I < E; ++I) {
48106c3fb27SDimitry Andric     if (I->getSGID() == SGID) {
48206c3fb27SDimitry Andric       MakePred = true;
48306c3fb27SDimitry Andric       continue;
48481ad6265SDimitry Andric     }
48506c3fb27SDimitry Andric     auto Group = *I;
48606c3fb27SDimitry Andric     AddedCost += Group.link(*SU, MakePred, AddedEdges);
48706c3fb27SDimitry Andric     assert(AddedCost >= 0);
48806c3fb27SDimitry Andric   }
48906c3fb27SDimitry Andric   return AddedCost;
490bdd1243dSDimitry Andric }
49181ad6265SDimitry Andric 
492bdd1243dSDimitry Andric int PipelineSolver::addEdges(
493bdd1243dSDimitry Andric     SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
494bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
495bdd1243dSDimitry Andric 
49606c3fb27SDimitry Andric   // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
49706c3fb27SDimitry Andric   // instructions that are the ultimate successors in the resultant mutation.
49806c3fb27SDimitry Andric   // Therefore, in such a configuration, the SchedGroups occurring before the
49906c3fb27SDimitry Andric   // candidate SGID are successors of the candidate SchedGroup, thus the current
50006c3fb27SDimitry Andric   // SU should be linked as a predecessor to SUs in those SchedGroups. The
50106c3fb27SDimitry Andric   // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
50206c3fb27SDimitry Andric   // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
50306c3fb27SDimitry Andric   // IsBottomUp (in reverse).
50406c3fb27SDimitry Andric   return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
50506c3fb27SDimitry Andric                                 SyncPipeline.rend())
50606c3fb27SDimitry Andric                     : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
50706c3fb27SDimitry Andric                                 SyncPipeline.end());
508bdd1243dSDimitry Andric }
509bdd1243dSDimitry Andric 
510bdd1243dSDimitry Andric void PipelineSolver::removeEdges(
511bdd1243dSDimitry Andric     const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
512bdd1243dSDimitry Andric   // Only remove the edges that we have added when testing
513bdd1243dSDimitry Andric   // the fit.
514bdd1243dSDimitry Andric   for (auto &PredSuccPair : EdgesToRemove) {
515bdd1243dSDimitry Andric     SUnit *Pred = PredSuccPair.first;
516bdd1243dSDimitry Andric     SUnit *Succ = PredSuccPair.second;
517bdd1243dSDimitry Andric 
518bdd1243dSDimitry Andric     auto Match = llvm::find_if(
519bdd1243dSDimitry Andric         Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
520bdd1243dSDimitry Andric     if (Match != Succ->Preds.end()) {
521bdd1243dSDimitry Andric       assert(Match->isArtificial());
522bdd1243dSDimitry Andric       Succ->removePred(*Match);
523bdd1243dSDimitry Andric     }
524bdd1243dSDimitry Andric   }
525bdd1243dSDimitry Andric }
526bdd1243dSDimitry Andric 
527bdd1243dSDimitry Andric void PipelineSolver::advancePosition() {
528bdd1243dSDimitry Andric   ++CurrConflInstNo;
529bdd1243dSDimitry Andric 
530bdd1243dSDimitry Andric   if (static_cast<size_t>(CurrConflInstNo) >=
531bdd1243dSDimitry Andric       PipelineInstrs[CurrSyncGroupIdx].size()) {
532bdd1243dSDimitry Andric     CurrConflInstNo = 0;
533bdd1243dSDimitry Andric     ++CurrSyncGroupIdx;
534bdd1243dSDimitry Andric     // Advance to next non-trivial pipeline
535bdd1243dSDimitry Andric     while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
536bdd1243dSDimitry Andric            PipelineInstrs[CurrSyncGroupIdx].size() == 0)
537bdd1243dSDimitry Andric       ++CurrSyncGroupIdx;
538bdd1243dSDimitry Andric   }
539bdd1243dSDimitry Andric }
540bdd1243dSDimitry Andric 
541bdd1243dSDimitry Andric void PipelineSolver::retreatPosition() {
542bdd1243dSDimitry Andric   assert(CurrConflInstNo >= 0);
543bdd1243dSDimitry Andric   assert(CurrSyncGroupIdx >= 0);
544bdd1243dSDimitry Andric 
545bdd1243dSDimitry Andric   if (CurrConflInstNo > 0) {
546bdd1243dSDimitry Andric     --CurrConflInstNo;
547bdd1243dSDimitry Andric     return;
548bdd1243dSDimitry Andric   }
549bdd1243dSDimitry Andric 
550bdd1243dSDimitry Andric   if (CurrConflInstNo == 0) {
551bdd1243dSDimitry Andric     // If we return to the starting position, we have explored
552bdd1243dSDimitry Andric     // the entire tree
553bdd1243dSDimitry Andric     if (CurrSyncGroupIdx == BeginSyncGroupIdx)
554bdd1243dSDimitry Andric       return;
555bdd1243dSDimitry Andric 
556bdd1243dSDimitry Andric     --CurrSyncGroupIdx;
557bdd1243dSDimitry Andric     // Go to previous non-trivial pipeline
558bdd1243dSDimitry Andric     while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
559bdd1243dSDimitry Andric       --CurrSyncGroupIdx;
560bdd1243dSDimitry Andric 
561bdd1243dSDimitry Andric     CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
562bdd1243dSDimitry Andric   }
563bdd1243dSDimitry Andric }
564bdd1243dSDimitry Andric 
565bdd1243dSDimitry Andric bool PipelineSolver::checkOptimal() {
566bdd1243dSDimitry Andric   if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
567bdd1243dSDimitry Andric     if (BestCost == -1 || CurrCost < BestCost) {
568bdd1243dSDimitry Andric       BestPipeline = CurrPipeline;
569bdd1243dSDimitry Andric       BestCost = CurrCost;
570bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
571bdd1243dSDimitry Andric     }
572bdd1243dSDimitry Andric     assert(BestCost >= 0);
573bdd1243dSDimitry Andric   }
574bdd1243dSDimitry Andric 
575bdd1243dSDimitry Andric   bool DoneExploring = false;
576bdd1243dSDimitry Andric   if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
577bdd1243dSDimitry Andric     DoneExploring = true;
578bdd1243dSDimitry Andric 
579bdd1243dSDimitry Andric   return (DoneExploring || BestCost == 0);
580bdd1243dSDimitry Andric }
581bdd1243dSDimitry Andric 
58206c3fb27SDimitry Andric template <typename T>
583bdd1243dSDimitry Andric void PipelineSolver::populateReadyList(
58406c3fb27SDimitry Andric     SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
58506c3fb27SDimitry Andric   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
58606c3fb27SDimitry Andric   auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
587bdd1243dSDimitry Andric   assert(CurrSU.second.size() >= 1);
58806c3fb27SDimitry Andric 
589bdd1243dSDimitry Andric   for (; I != E; ++I) {
590bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
591bdd1243dSDimitry Andric     int CandSGID = *I;
5925f757f3fSDimitry Andric     SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
5935f757f3fSDimitry Andric       return SG.getSGID() == CandSGID;
5945f757f3fSDimitry Andric     });
5955f757f3fSDimitry Andric     assert(Match);
596bdd1243dSDimitry Andric 
597bdd1243dSDimitry Andric     if (UseCostHeur) {
598bdd1243dSDimitry Andric       if (Match->isFull()) {
599bdd1243dSDimitry Andric         ReadyList.push_back(std::pair(*I, MissPenalty));
600bdd1243dSDimitry Andric         continue;
601bdd1243dSDimitry Andric       }
602bdd1243dSDimitry Andric 
603bdd1243dSDimitry Andric       int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
604bdd1243dSDimitry Andric       ReadyList.push_back(std::pair(*I, TempCost));
605bdd1243dSDimitry Andric       removeEdges(AddedEdges);
606bdd1243dSDimitry Andric     } else
607bdd1243dSDimitry Andric       ReadyList.push_back(std::pair(*I, -1));
608bdd1243dSDimitry Andric   }
609bdd1243dSDimitry Andric 
610bdd1243dSDimitry Andric   if (UseCostHeur) {
611bdd1243dSDimitry Andric     std::sort(ReadyList.begin(), ReadyList.end(),
612bdd1243dSDimitry Andric               [](std::pair<int, int> A, std::pair<int, int> B) {
613bdd1243dSDimitry Andric                 return A.second < B.second;
614bdd1243dSDimitry Andric               });
615bdd1243dSDimitry Andric   }
616bdd1243dSDimitry Andric 
617bdd1243dSDimitry Andric   assert(ReadyList.size() == CurrSU.second.size());
618bdd1243dSDimitry Andric }
619bdd1243dSDimitry Andric 
620bdd1243dSDimitry Andric bool PipelineSolver::solveExact() {
621bdd1243dSDimitry Andric   if (checkOptimal())
622bdd1243dSDimitry Andric     return true;
623bdd1243dSDimitry Andric 
624bdd1243dSDimitry Andric   if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
625bdd1243dSDimitry Andric     return false;
626bdd1243dSDimitry Andric 
627bdd1243dSDimitry Andric   assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
628bdd1243dSDimitry Andric   assert(static_cast<size_t>(CurrConflInstNo) <
629bdd1243dSDimitry Andric          PipelineInstrs[CurrSyncGroupIdx].size());
630bdd1243dSDimitry Andric   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
631bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
632bdd1243dSDimitry Andric                     << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
633bdd1243dSDimitry Andric 
634bdd1243dSDimitry Andric   // SchedGroup -> Cost pairs
635bdd1243dSDimitry Andric   SmallVector<std::pair<int, int>, 4> ReadyList;
636bdd1243dSDimitry Andric   // Prioritize the candidate sched groups in terms of lowest cost first
63706c3fb27SDimitry Andric   IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
63806c3fb27SDimitry Andric                                  CurrSU.second.rend())
63906c3fb27SDimitry Andric              : populateReadyList(ReadyList, CurrSU.second.begin(),
64006c3fb27SDimitry Andric                                  CurrSU.second.end());
641bdd1243dSDimitry Andric 
642bdd1243dSDimitry Andric   auto I = ReadyList.begin();
643bdd1243dSDimitry Andric   auto E = ReadyList.end();
644bdd1243dSDimitry Andric   for (; I != E; ++I) {
645bdd1243dSDimitry Andric     // If we are trying SGs in least cost order, and the current SG is cost
646bdd1243dSDimitry Andric     // infeasible, then all subsequent SGs will also be cost infeasible, so we
647bdd1243dSDimitry Andric     // can prune.
648bdd1243dSDimitry Andric     if (BestCost != -1 && (CurrCost + I->second > BestCost))
649bdd1243dSDimitry Andric       return false;
650bdd1243dSDimitry Andric 
651bdd1243dSDimitry Andric     int CandSGID = I->first;
652bdd1243dSDimitry Andric     int AddedCost = 0;
653bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
654bdd1243dSDimitry Andric     auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
655bdd1243dSDimitry Andric     SchedGroup *Match;
656bdd1243dSDimitry Andric     for (auto &SG : SyncPipeline) {
657bdd1243dSDimitry Andric       if (SG.getSGID() == CandSGID)
658bdd1243dSDimitry Andric         Match = &SG;
659bdd1243dSDimitry Andric     }
660bdd1243dSDimitry Andric 
661bdd1243dSDimitry Andric     if (Match->isFull())
662bdd1243dSDimitry Andric       continue;
663bdd1243dSDimitry Andric 
66406c3fb27SDimitry Andric     if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
66506c3fb27SDimitry Andric       continue;
66606c3fb27SDimitry Andric 
667bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
668bdd1243dSDimitry Andric                       << (int)Match->getMask() << " and ID " << CandSGID
669bdd1243dSDimitry Andric                       << "\n");
670bdd1243dSDimitry Andric     Match->add(*CurrSU.first);
671bdd1243dSDimitry Andric     AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
672bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
673bdd1243dSDimitry Andric     CurrCost += AddedCost;
674bdd1243dSDimitry Andric     advancePosition();
675bdd1243dSDimitry Andric     ++BranchesExplored;
676bdd1243dSDimitry Andric     bool FinishedExploring = false;
677bdd1243dSDimitry Andric     // If the Cost after adding edges is greater than a known solution,
678bdd1243dSDimitry Andric     // backtrack
679bdd1243dSDimitry Andric     if (CurrCost < BestCost || BestCost == -1) {
680bdd1243dSDimitry Andric       if (solveExact()) {
681bdd1243dSDimitry Andric         FinishedExploring = BestCost != 0;
682bdd1243dSDimitry Andric         if (!FinishedExploring)
683bdd1243dSDimitry Andric           return true;
684bdd1243dSDimitry Andric       }
685bdd1243dSDimitry Andric     }
686bdd1243dSDimitry Andric 
687bdd1243dSDimitry Andric     retreatPosition();
688bdd1243dSDimitry Andric     CurrCost -= AddedCost;
689bdd1243dSDimitry Andric     removeEdges(AddedEdges);
690bdd1243dSDimitry Andric     Match->pop();
691bdd1243dSDimitry Andric     CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
692bdd1243dSDimitry Andric     if (FinishedExploring)
693bdd1243dSDimitry Andric       return true;
694bdd1243dSDimitry Andric   }
695bdd1243dSDimitry Andric 
696bdd1243dSDimitry Andric   // Try the pipeline where the current instruction is omitted.
697bdd1243dSDimitry Andric   // Potentially, if we omit a problematic instruction from the pipeline,
698bdd1243dSDimitry Andric   // all the other instructions can fit nicely.
699bdd1243dSDimitry Andric   CurrCost += MissPenalty;
700bdd1243dSDimitry Andric   advancePosition();
701bdd1243dSDimitry Andric 
702bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
703bdd1243dSDimitry Andric 
704bdd1243dSDimitry Andric   bool FinishedExploring = false;
705bdd1243dSDimitry Andric   if (CurrCost < BestCost || BestCost == -1) {
706bdd1243dSDimitry Andric     if (solveExact()) {
707bdd1243dSDimitry Andric       bool FinishedExploring = BestCost != 0;
708bdd1243dSDimitry Andric       if (!FinishedExploring)
709bdd1243dSDimitry Andric         return true;
710bdd1243dSDimitry Andric     }
711bdd1243dSDimitry Andric   }
712bdd1243dSDimitry Andric 
713bdd1243dSDimitry Andric   retreatPosition();
714bdd1243dSDimitry Andric   CurrCost -= MissPenalty;
715bdd1243dSDimitry Andric   return FinishedExploring;
716bdd1243dSDimitry Andric }
717bdd1243dSDimitry Andric 
71806c3fb27SDimitry Andric template <typename T>
71906c3fb27SDimitry Andric void PipelineSolver::greedyFind(
72006c3fb27SDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
721bdd1243dSDimitry Andric   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
722bdd1243dSDimitry Andric   int BestNodeCost = -1;
723bdd1243dSDimitry Andric   int TempCost;
724bdd1243dSDimitry Andric   SchedGroup *BestGroup = nullptr;
725bdd1243dSDimitry Andric   int BestGroupID = -1;
726bdd1243dSDimitry Andric   auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
727bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
728bdd1243dSDimitry Andric                     << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
729bdd1243dSDimitry Andric 
730bdd1243dSDimitry Andric   // Since we have added the potential SchedGroups from bottom up, but
731bdd1243dSDimitry Andric   // traversed the DAG from top down, parse over the groups from last to
732bdd1243dSDimitry Andric   // first. If we fail to do this for the greedy algorithm, the solution will
733bdd1243dSDimitry Andric   // likely not be good in more complex cases.
734bdd1243dSDimitry Andric   for (; I != E; ++I) {
735bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
736bdd1243dSDimitry Andric     int CandSGID = *I;
7375f757f3fSDimitry Andric     SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
7385f757f3fSDimitry Andric       return SG.getSGID() == CandSGID;
7395f757f3fSDimitry Andric     });
7405f757f3fSDimitry Andric     assert(Match);
741bdd1243dSDimitry Andric 
742bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
743bdd1243dSDimitry Andric                       << (int)Match->getMask() << "\n");
744bdd1243dSDimitry Andric 
745bdd1243dSDimitry Andric     if (Match->isFull()) {
746bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
747bdd1243dSDimitry Andric       continue;
748bdd1243dSDimitry Andric     }
74906c3fb27SDimitry Andric     if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
75006c3fb27SDimitry Andric       LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
75106c3fb27SDimitry Andric       continue;
75206c3fb27SDimitry Andric     }
753bdd1243dSDimitry Andric     TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
754bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
755bdd1243dSDimitry Andric     if (TempCost < BestNodeCost || BestNodeCost == -1) {
756bdd1243dSDimitry Andric       BestGroup = Match;
757bdd1243dSDimitry Andric       BestNodeCost = TempCost;
758bdd1243dSDimitry Andric       BestGroupID = CandSGID;
759bdd1243dSDimitry Andric     }
760bdd1243dSDimitry Andric     removeEdges(AddedEdges);
761bdd1243dSDimitry Andric     if (BestNodeCost == 0)
762bdd1243dSDimitry Andric       break;
763bdd1243dSDimitry Andric   }
764bdd1243dSDimitry Andric 
765bdd1243dSDimitry Andric   if (BestGroupID != -1) {
766bdd1243dSDimitry Andric     BestGroup->add(*CurrSU.first);
767bdd1243dSDimitry Andric     addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
768bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
769bdd1243dSDimitry Andric                       << (int)BestGroup->getMask() << "\n");
770bdd1243dSDimitry Andric     BestCost += TempCost;
771bdd1243dSDimitry Andric   } else
772bdd1243dSDimitry Andric     BestCost += MissPenalty;
773bdd1243dSDimitry Andric 
774bdd1243dSDimitry Andric   CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
77506c3fb27SDimitry Andric }
77606c3fb27SDimitry Andric 
77706c3fb27SDimitry Andric bool PipelineSolver::solveGreedy() {
77806c3fb27SDimitry Andric   BestCost = 0;
77906c3fb27SDimitry Andric   std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
78006c3fb27SDimitry Andric 
78106c3fb27SDimitry Andric   while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
78206c3fb27SDimitry Andric     SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
78306c3fb27SDimitry Andric     IsBottomUp
78406c3fb27SDimitry Andric         ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
78506c3fb27SDimitry Andric         : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
786bdd1243dSDimitry Andric     advancePosition();
787bdd1243dSDimitry Andric   }
788bdd1243dSDimitry Andric   BestPipeline = CurrPipeline;
789bdd1243dSDimitry Andric   removeEdges(AddedEdges);
790bdd1243dSDimitry Andric   return false;
791bdd1243dSDimitry Andric }
792bdd1243dSDimitry Andric 
793bdd1243dSDimitry Andric unsigned PipelineSolver::computeProblemSize() {
794bdd1243dSDimitry Andric   unsigned ProblemSize = 0;
795bdd1243dSDimitry Andric   for (auto &PipeConflicts : PipelineInstrs) {
796bdd1243dSDimitry Andric     ProblemSize += PipeConflicts.size();
797bdd1243dSDimitry Andric   }
798bdd1243dSDimitry Andric 
799bdd1243dSDimitry Andric   return ProblemSize;
800bdd1243dSDimitry Andric }
801bdd1243dSDimitry Andric 
802bdd1243dSDimitry Andric void PipelineSolver::solve() {
803bdd1243dSDimitry Andric   if (!NeedsSolver)
804bdd1243dSDimitry Andric     return;
805bdd1243dSDimitry Andric 
806bdd1243dSDimitry Andric   unsigned ProblemSize = computeProblemSize();
807bdd1243dSDimitry Andric   assert(ProblemSize > 0);
808bdd1243dSDimitry Andric 
809bdd1243dSDimitry Andric   bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
810bdd1243dSDimitry Andric   MissPenalty = (ProblemSize / 2) + 1;
811bdd1243dSDimitry Andric 
812bdd1243dSDimitry Andric   LLVM_DEBUG(DAG->dump());
813bdd1243dSDimitry Andric   if (EnableExactSolver || BelowCutoff) {
814bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
815bdd1243dSDimitry Andric     solveGreedy();
816bdd1243dSDimitry Andric     reset();
817bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
818bdd1243dSDimitry Andric     if (BestCost > 0) {
819bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
820bdd1243dSDimitry Andric       solveExact();
821bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
822bdd1243dSDimitry Andric     }
823bdd1243dSDimitry Andric   } else { // Use the Greedy Algorithm by default
824bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
825bdd1243dSDimitry Andric     solveGreedy();
826bdd1243dSDimitry Andric   }
827bdd1243dSDimitry Andric 
828bdd1243dSDimitry Andric   makePipeline();
82906c3fb27SDimitry Andric   LLVM_DEBUG(dbgs() << "After applying mutation\n");
83006c3fb27SDimitry Andric   LLVM_DEBUG(DAG->dump());
831bdd1243dSDimitry Andric }
832bdd1243dSDimitry Andric 
83306c3fb27SDimitry Andric enum IGLPStrategyID : int {
83406c3fb27SDimitry Andric   MFMASmallGemmOptID = 0,
83506c3fb27SDimitry Andric   MFMASmallGemmSingleWaveOptID = 1,
836*0fca6ea1SDimitry Andric   MFMAExpInterleave = 2
83706c3fb27SDimitry Andric };
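// A sketch of how a strategy is selected (assuming the IGLP_OPT operand
// encoding matches these IDs): the immediate operand of an IGLP_OPT pseudo,
// i.e. the value passed to llvm.amdgcn.iglp.opt(i32), is interpreted as an
// IGLPStrategyID, so iglp.opt(1) requests MFMASmallGemmSingleWaveOptID.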
838bdd1243dSDimitry Andric 
839bdd1243dSDimitry Andric // Implement a IGLP scheduling strategy.
840bdd1243dSDimitry Andric class IGLPStrategy {
841bdd1243dSDimitry Andric protected:
842bdd1243dSDimitry Andric   ScheduleDAGInstrs *DAG;
843bdd1243dSDimitry Andric 
844bdd1243dSDimitry Andric   const SIInstrInfo *TII;
845bdd1243dSDimitry Andric 
846bdd1243dSDimitry Andric public:
8475f757f3fSDimitry Andric   /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
848*0fca6ea1SDimitry Andric   virtual bool applyIGLPStrategy(
849bdd1243dSDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
8505f757f3fSDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
851*0fca6ea1SDimitry Andric       AMDGPU::SchedulingPhase Phase) = 0;
852bdd1243dSDimitry Andric 
853bdd1243dSDimitry Andric   // Returns true if this strategy should be applied to a ScheduleDAG.
854*0fca6ea1SDimitry Andric   virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
855*0fca6ea1SDimitry Andric                                    AMDGPU::SchedulingPhase Phase) = 0;
856bdd1243dSDimitry Andric 
857*0fca6ea1SDimitry Andric   bool IsBottomUp = true;
85806c3fb27SDimitry Andric 
859bdd1243dSDimitry Andric   IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
860bdd1243dSDimitry Andric       : DAG(DAG), TII(TII) {}
861bdd1243dSDimitry Andric 
862bdd1243dSDimitry Andric   virtual ~IGLPStrategy() = default;
863bdd1243dSDimitry Andric };
864bdd1243dSDimitry Andric 
865bdd1243dSDimitry Andric class MFMASmallGemmOpt final : public IGLPStrategy {
86606c3fb27SDimitry Andric private:
867bdd1243dSDimitry Andric public:
868*0fca6ea1SDimitry Andric   bool applyIGLPStrategy(
869bdd1243dSDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
8705f757f3fSDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
871*0fca6ea1SDimitry Andric       AMDGPU::SchedulingPhase Phase) override;
872bdd1243dSDimitry Andric 
873*0fca6ea1SDimitry Andric   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
874*0fca6ea1SDimitry Andric                            AMDGPU::SchedulingPhase Phase) override {
875*0fca6ea1SDimitry Andric     return true;
876*0fca6ea1SDimitry Andric   }
877bdd1243dSDimitry Andric 
878bdd1243dSDimitry Andric   MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
87906c3fb27SDimitry Andric       : IGLPStrategy(DAG, TII) {
880*0fca6ea1SDimitry Andric     IsBottomUp = true;
88106c3fb27SDimitry Andric   }
882bdd1243dSDimitry Andric };
883bdd1243dSDimitry Andric 
884*0fca6ea1SDimitry Andric bool MFMASmallGemmOpt::applyIGLPStrategy(
885bdd1243dSDimitry Andric     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
8865f757f3fSDimitry Andric     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
887*0fca6ea1SDimitry Andric     AMDGPU::SchedulingPhase Phase) {
888bdd1243dSDimitry Andric   // Count the number of MFMA instructions.
889bdd1243dSDimitry Andric   unsigned MFMACount = 0;
890bdd1243dSDimitry Andric   for (const MachineInstr &I : *DAG)
891bdd1243dSDimitry Andric     if (TII->isMFMAorWMMA(I))
892bdd1243dSDimitry Andric       ++MFMACount;
893bdd1243dSDimitry Andric 
894bdd1243dSDimitry Andric   const unsigned PipelineSyncID = 0;
895bdd1243dSDimitry Andric   SchedGroup *SG = nullptr;
896bdd1243dSDimitry Andric   for (unsigned I = 0; I < MFMACount * 3; ++I) {
897bdd1243dSDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
898bdd1243dSDimitry Andric         SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
899bdd1243dSDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
900bdd1243dSDimitry Andric 
901bdd1243dSDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
902bdd1243dSDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
903bdd1243dSDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
904bdd1243dSDimitry Andric   }
905*0fca6ea1SDimitry Andric 
906*0fca6ea1SDimitry Andric   return true;
907*0fca6ea1SDimitry Andric }
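// For illustration, with MFMACount == 2 the loop above requests the pipeline
// {DS x2, MFMA x1} repeated six times (MFMACount * 3), i.e. the scheduler is
// asked to interleave pairs of DS instructions with single MFMAs across the
// region.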
908*0fca6ea1SDimitry Andric 
909*0fca6ea1SDimitry Andric class MFMAExpInterleaveOpt final : public IGLPStrategy {
910*0fca6ea1SDimitry Andric private:
911*0fca6ea1SDimitry Andric   // The count of TRANS SUs involved in the interleaved pipeline
912*0fca6ea1SDimitry Andric   static unsigned TransPipeCount;
913*0fca6ea1SDimitry Andric   // The count of MFMA SUs involved in the interleaved pipeline
914*0fca6ea1SDimitry Andric   static unsigned MFMAPipeCount;
915*0fca6ea1SDimitry Andric   // The count of Add SUs involved in the interleaved pipeline
916*0fca6ea1SDimitry Andric   static unsigned AddPipeCount;
917*0fca6ea1SDimitry Andric   // The number of transitive MFMA successors for each TRANS SU
918*0fca6ea1SDimitry Andric   static unsigned MFMAEnablement;
919*0fca6ea1SDimitry Andric   // The number of transitive TRANS predecessors for each MFMA SU
920*0fca6ea1SDimitry Andric   static unsigned ExpRequirement;
921*0fca6ea1SDimitry Andric   // The count of independent "chains" of MFMA instructions in the pipeline
922*0fca6ea1SDimitry Andric   static unsigned MFMAChains;
923*0fca6ea1SDimitry Andric   // The length of each independent "chain" of MFMA instructions
924*0fca6ea1SDimitry Andric   static unsigned MFMAChainLength;
925*0fca6ea1SDimitry Andric   // Whether or not the pipeline has V_CVT instructions
926*0fca6ea1SDimitry Andric   static bool HasCvt;
927*0fca6ea1SDimitry Andric   // Whether or not there are instructions between the TRANS instruction and
928*0fca6ea1SDimitry Andric   // V_CVT
929*0fca6ea1SDimitry Andric   static bool HasChainBetweenCvt;
930*0fca6ea1SDimitry Andric   // The first occurring DS_READ which feeds an MFMA chain
931*0fca6ea1SDimitry Andric   static std::optional<unsigned> FirstPipeDSR;
932*0fca6ea1SDimitry Andric   // The MFMAPipe SUs with no MFMA predecessors
933*0fca6ea1SDimitry Andric   SmallVector<SUnit *, 4> MFMAChainSeeds;
934*0fca6ea1SDimitry Andric   // Compute the heuristics for the pipeline, returning whether or not the DAG
935*0fca6ea1SDimitry Andric   // is a suitable candidate for the mutation
936*0fca6ea1SDimitry Andric   bool analyzeDAG(const SIInstrInfo *TII);
937*0fca6ea1SDimitry Andric 
938*0fca6ea1SDimitry Andric   /// Whether or not the instruction is a transitive predecessor of an MFMA
939*0fca6ea1SDimitry Andric   /// instruction
940*0fca6ea1SDimitry Andric   class IsPipeExp final : public InstructionRule {
941*0fca6ea1SDimitry Andric   public:
942*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
943*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
944*0fca6ea1SDimitry Andric 
945*0fca6ea1SDimitry Andric       auto DAG = SyncPipe[0].DAG;
946*0fca6ea1SDimitry Andric 
947*0fca6ea1SDimitry Andric       if (Cache->empty()) {
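        // Lazily populate the cache with every MFMA/WMMA SUnit in the DAG
        // (walked in reverse order) on the first query; later queries reduce
        // to reachability checks against this cached set.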
948*0fca6ea1SDimitry Andric         auto I = DAG->SUnits.rbegin();
949*0fca6ea1SDimitry Andric         auto E = DAG->SUnits.rend();
950*0fca6ea1SDimitry Andric         for (; I != E; I++) {
951*0fca6ea1SDimitry Andric           if (TII->isMFMAorWMMA(*I->getInstr()))
952*0fca6ea1SDimitry Andric             Cache->push_back(&*I);
953*0fca6ea1SDimitry Andric         }
954*0fca6ea1SDimitry Andric         if (Cache->empty())
955*0fca6ea1SDimitry Andric           return false;
956*0fca6ea1SDimitry Andric       }
957*0fca6ea1SDimitry Andric 
958*0fca6ea1SDimitry Andric       auto Reaches = (std::any_of(
959*0fca6ea1SDimitry Andric           Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
960*0fca6ea1SDimitry Andric             return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
961*0fca6ea1SDimitry Andric           }));
962*0fca6ea1SDimitry Andric 
963*0fca6ea1SDimitry Andric       return Reaches;
964*0fca6ea1SDimitry Andric     }
965*0fca6ea1SDimitry Andric     IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
966*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
967*0fca6ea1SDimitry Andric   };
968*0fca6ea1SDimitry Andric 
969*0fca6ea1SDimitry Andric   /// Whether or not the instruction is a transitive predecessor of the
970*0fca6ea1SDimitry Andric   /// \p Number th MFMA of the MFMAs occurring after a TRANS instruction
971*0fca6ea1SDimitry Andric   class EnablesNthMFMA final : public InstructionRule {
972*0fca6ea1SDimitry Andric   private:
973*0fca6ea1SDimitry Andric     unsigned Number = 1;
974*0fca6ea1SDimitry Andric 
975*0fca6ea1SDimitry Andric   public:
976*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
977*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
978*0fca6ea1SDimitry Andric       bool FoundTrans = false;
979*0fca6ea1SDimitry Andric       unsigned Counter = 1;
980*0fca6ea1SDimitry Andric       auto DAG = SyncPipe[0].DAG;
981*0fca6ea1SDimitry Andric 
982*0fca6ea1SDimitry Andric       if (Cache->empty()) {
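        // Scan the SUnits in program order: once the first TRANS is seen,
        // count MFMAs until the Number-th one and cache it; the rule then
        // reduces to a reachability check against that MFMA.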
983*0fca6ea1SDimitry Andric         SmallVector<SUnit *, 8> Worklist;
984*0fca6ea1SDimitry Andric 
985*0fca6ea1SDimitry Andric         auto I = DAG->SUnits.begin();
986*0fca6ea1SDimitry Andric         auto E = DAG->SUnits.end();
987*0fca6ea1SDimitry Andric         for (; I != E; I++) {
988*0fca6ea1SDimitry Andric           if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
989*0fca6ea1SDimitry Andric             if (Counter == Number) {
990*0fca6ea1SDimitry Andric               Cache->push_back(&*I);
991*0fca6ea1SDimitry Andric               break;
992*0fca6ea1SDimitry Andric             }
993*0fca6ea1SDimitry Andric             ++Counter;
994*0fca6ea1SDimitry Andric           }
995*0fca6ea1SDimitry Andric           if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))
996*0fca6ea1SDimitry Andric             FoundTrans = true;
997*0fca6ea1SDimitry Andric         }
998*0fca6ea1SDimitry Andric         if (Cache->empty())
999*0fca6ea1SDimitry Andric           return false;
1000*0fca6ea1SDimitry Andric       }
1001*0fca6ea1SDimitry Andric 
1002*0fca6ea1SDimitry Andric       return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
1003*0fca6ea1SDimitry Andric     }
1004*0fca6ea1SDimitry Andric 
1005*0fca6ea1SDimitry Andric     EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
1006*0fca6ea1SDimitry Andric                    bool NeedsCache = false)
1007*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
1008*0fca6ea1SDimitry Andric   };
1009*0fca6ea1SDimitry Andric 
1010*0fca6ea1SDimitry Andric   /// Whether or not the instruction enables the exact MFMA that is the \p
1011*0fca6ea1SDimitry Andric   /// Number th MFMA in the chain starting with \p ChainSeed
1012*0fca6ea1SDimitry Andric   class EnablesNthMFMAInChain final : public InstructionRule {
1013*0fca6ea1SDimitry Andric   private:
1014*0fca6ea1SDimitry Andric     unsigned Number = 1;
1015*0fca6ea1SDimitry Andric     SUnit *ChainSeed;
1016*0fca6ea1SDimitry Andric 
1017*0fca6ea1SDimitry Andric   public:
1018*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1019*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1020*0fca6ea1SDimitry Andric       auto DAG = SyncPipe[0].DAG;
1021*0fca6ea1SDimitry Andric 
1022*0fca6ea1SDimitry Andric       if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1023*0fca6ea1SDimitry Andric         return false;
1024*0fca6ea1SDimitry Andric 
1025*0fca6ea1SDimitry Andric       if (Cache->empty()) {
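        // Walk Number MFMA->MFMA successor links down the chain starting at
        // ChainSeed and cache the MFMA reached; bail out if the chain is
        // shorter than that.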
1026*0fca6ea1SDimitry Andric         auto TempSU = ChainSeed;
1027*0fca6ea1SDimitry Andric         auto Depth = Number;
1028*0fca6ea1SDimitry Andric         while (Depth > 0) {
1029*0fca6ea1SDimitry Andric           --Depth;
1030*0fca6ea1SDimitry Andric           bool Found = false;
1031*0fca6ea1SDimitry Andric           for (auto &Succ : TempSU->Succs) {
1032*0fca6ea1SDimitry Andric             if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1033*0fca6ea1SDimitry Andric               TempSU = Succ.getSUnit();
1034*0fca6ea1SDimitry Andric               Found = true;
1035*0fca6ea1SDimitry Andric               break;
1036*0fca6ea1SDimitry Andric             }
1037*0fca6ea1SDimitry Andric           }
1038*0fca6ea1SDimitry Andric           if (!Found)
1039*0fca6ea1SDimitry Andric             return false;
1040*0fca6ea1SDimitry Andric         }
1041*0fca6ea1SDimitry Andric 
1042*0fca6ea1SDimitry Andric         Cache->push_back(TempSU);
1043*0fca6ea1SDimitry Andric       }
1044*0fca6ea1SDimitry Andric       // If we failed to find the instruction to be placed into the cache, we
1045*0fca6ea1SDimitry Andric       // would have already exited.
1046*0fca6ea1SDimitry Andric       assert(!Cache->empty());
1047*0fca6ea1SDimitry Andric 
1048*0fca6ea1SDimitry Andric       return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
1049*0fca6ea1SDimitry Andric     }
1050*0fca6ea1SDimitry Andric 
1051*0fca6ea1SDimitry Andric     EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
1052*0fca6ea1SDimitry Andric                           const SIInstrInfo *TII, unsigned SGID,
1053*0fca6ea1SDimitry Andric                           bool NeedsCache = false)
1054*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1055*0fca6ea1SDimitry Andric           ChainSeed(ChainSeed) {}
1056*0fca6ea1SDimitry Andric   };
1057*0fca6ea1SDimitry Andric 
1058*0fca6ea1SDimitry Andric   /// Whether or not the instruction has fewer than \p Size immediate successors.
1059*0fca6ea1SDimitry Andric   /// If \p HasIntermediary is true, this also tests whether all successors of
1060*0fca6ea1SDimitry Andric   /// the SUnit have fewer than \p Size successors.
1061*0fca6ea1SDimitry Andric   class LessThanNSuccs final : public InstructionRule {
1062*0fca6ea1SDimitry Andric   private:
1063*0fca6ea1SDimitry Andric     unsigned Size = 1;
1064*0fca6ea1SDimitry Andric     bool HasIntermediary = false;
1065*0fca6ea1SDimitry Andric 
1066*0fca6ea1SDimitry Andric   public:
1067*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1068*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1069*0fca6ea1SDimitry Andric       if (!SyncPipe.size())
1070*0fca6ea1SDimitry Andric         return false;
1071*0fca6ea1SDimitry Andric 
1072*0fca6ea1SDimitry Andric       auto SuccSize = std::count_if(
1073*0fca6ea1SDimitry Andric           SU->Succs.begin(), SU->Succs.end(),
1074*0fca6ea1SDimitry Andric           [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
1075*0fca6ea1SDimitry Andric       if (SuccSize >= Size)
1076*0fca6ea1SDimitry Andric         return false;
1077*0fca6ea1SDimitry Andric 
1078*0fca6ea1SDimitry Andric       if (HasIntermediary) {
1079*0fca6ea1SDimitry Andric         for (auto Succ : SU->Succs) {
1080*0fca6ea1SDimitry Andric           auto SuccSize = std::count_if(
1081*0fca6ea1SDimitry Andric               Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
1082*0fca6ea1SDimitry Andric               [](const SDep &SuccSucc) {
1083*0fca6ea1SDimitry Andric                 return SuccSucc.getKind() == SDep::Data;
1084*0fca6ea1SDimitry Andric               });
1085*0fca6ea1SDimitry Andric           if (SuccSize >= Size)
1086*0fca6ea1SDimitry Andric             return false;
1087*0fca6ea1SDimitry Andric         }
1088*0fca6ea1SDimitry Andric       }
1089*0fca6ea1SDimitry Andric 
1090*0fca6ea1SDimitry Andric       return true;
1091*0fca6ea1SDimitry Andric     }
1092*0fca6ea1SDimitry Andric     LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
1093*0fca6ea1SDimitry Andric                    bool HasIntermediary = false, bool NeedsCache = false)
1094*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1095*0fca6ea1SDimitry Andric           HasIntermediary(HasIntermediary) {}
1096*0fca6ea1SDimitry Andric   };
1097*0fca6ea1SDimitry Andric 
1098*0fca6ea1SDimitry Andric   /// Whether or not the instruction has at least \p Size immediate successors.
1099*0fca6ea1SDimitry Andric   /// If \p HasIntermediary is true, this also tests whether any successor of
1100*0fca6ea1SDimitry Andric   /// the SUnit has at least \p Size successors.
1102*0fca6ea1SDimitry Andric   class GreaterThanOrEqualToNSuccs final : public InstructionRule {
1103*0fca6ea1SDimitry Andric   private:
1104*0fca6ea1SDimitry Andric     unsigned Size = 1;
1105*0fca6ea1SDimitry Andric     bool HasIntermediary = false;
1106*0fca6ea1SDimitry Andric 
1107*0fca6ea1SDimitry Andric   public:
1108*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1109*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1110*0fca6ea1SDimitry Andric       if (!SyncPipe.size())
1111*0fca6ea1SDimitry Andric         return false;
1112*0fca6ea1SDimitry Andric 
1113*0fca6ea1SDimitry Andric       auto SuccSize = std::count_if(
1114*0fca6ea1SDimitry Andric           SU->Succs.begin(), SU->Succs.end(),
1115*0fca6ea1SDimitry Andric           [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
1116*0fca6ea1SDimitry Andric       if (SuccSize >= Size)
1117*0fca6ea1SDimitry Andric         return true;
1118*0fca6ea1SDimitry Andric 
1119*0fca6ea1SDimitry Andric       if (HasIntermediary) {
1120*0fca6ea1SDimitry Andric         for (auto Succ : SU->Succs) {
1121*0fca6ea1SDimitry Andric           auto SuccSize = std::count_if(
1122*0fca6ea1SDimitry Andric               Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
1123*0fca6ea1SDimitry Andric               [](const SDep &SuccSucc) {
1124*0fca6ea1SDimitry Andric                 return SuccSucc.getKind() == SDep::Data;
1125*0fca6ea1SDimitry Andric               });
1126*0fca6ea1SDimitry Andric           if (SuccSize >= Size)
1127*0fca6ea1SDimitry Andric             return true;
1128*0fca6ea1SDimitry Andric         }
1129*0fca6ea1SDimitry Andric       }
1130*0fca6ea1SDimitry Andric 
1131*0fca6ea1SDimitry Andric       return false;
1132*0fca6ea1SDimitry Andric     }
1133*0fca6ea1SDimitry Andric     GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
1134*0fca6ea1SDimitry Andric                                unsigned SGID, bool HasIntermediary = false,
1135*0fca6ea1SDimitry Andric                                bool NeedsCache = false)
1136*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1137*0fca6ea1SDimitry Andric           HasIntermediary(HasIntermediary) {}
1138*0fca6ea1SDimitry Andric   };
1139*0fca6ea1SDimitry Andric 
1140*0fca6ea1SDimitry Andric   // Whether or not the instruction is a relevant V_CVT instruction.
1141*0fca6ea1SDimitry Andric   class IsCvt final : public InstructionRule {
1142*0fca6ea1SDimitry Andric   public:
1143*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1144*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1145*0fca6ea1SDimitry Andric       auto Opc = SU->getInstr()->getOpcode();
1146*0fca6ea1SDimitry Andric       return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
1147*0fca6ea1SDimitry Andric              Opc == AMDGPU::V_CVT_I32_F32_e32;
1148*0fca6ea1SDimitry Andric     }
1149*0fca6ea1SDimitry Andric     IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1150*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
1151*0fca6ea1SDimitry Andric   };
1152*0fca6ea1SDimitry Andric 
1153*0fca6ea1SDimitry Andric   // Whether or not the instruction is a V_FMA_F32 or V_PK_FMA_F32 instruction.
1154*0fca6ea1SDimitry Andric   class IsFMA final : public InstructionRule {
1155*0fca6ea1SDimitry Andric   public:
1156*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1157*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1158*0fca6ea1SDimitry Andric       return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
1159*0fca6ea1SDimitry Andric              SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
1160*0fca6ea1SDimitry Andric     }
1161*0fca6ea1SDimitry Andric     IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1162*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
1163*0fca6ea1SDimitry Andric   };
1164*0fca6ea1SDimitry Andric 
1165*0fca6ea1SDimitry Andric   // Whether or not the instruction is a V_ADD_F32 instruction.
1166*0fca6ea1SDimitry Andric   class IsPipeAdd final : public InstructionRule {
1167*0fca6ea1SDimitry Andric   public:
1168*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1169*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1170*0fca6ea1SDimitry Andric       return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
1171*0fca6ea1SDimitry Andric     }
1172*0fca6ea1SDimitry Andric     IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1173*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
1174*0fca6ea1SDimitry Andric   };
1175*0fca6ea1SDimitry Andric 
1176*0fca6ea1SDimitry Andric   /// Whether or not the instruction is an immediate RAW successor
1177*0fca6ea1SDimitry Andric   /// of the SchedGroup \p Distance steps before.
1178*0fca6ea1SDimitry Andric   class IsSuccOfPrevNthGroup final : public InstructionRule {
1179*0fca6ea1SDimitry Andric   private:
1180*0fca6ea1SDimitry Andric     unsigned Distance = 1;
1181*0fca6ea1SDimitry Andric 
1182*0fca6ea1SDimitry Andric   public:
1183*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1184*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1185*0fca6ea1SDimitry Andric       SchedGroup *OtherGroup = nullptr;
1186*0fca6ea1SDimitry Andric       if (!SyncPipe.size())
1187*0fca6ea1SDimitry Andric         return false;
1188*0fca6ea1SDimitry Andric 
1189*0fca6ea1SDimitry Andric       for (auto &PipeSG : SyncPipe) {
1190*0fca6ea1SDimitry Andric         if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1191*0fca6ea1SDimitry Andric           OtherGroup = &PipeSG;
1192*0fca6ea1SDimitry Andric       }
1193*0fca6ea1SDimitry Andric 
1194*0fca6ea1SDimitry Andric       if (!OtherGroup)
1195*0fca6ea1SDimitry Andric         return false;
1196*0fca6ea1SDimitry Andric       if (!OtherGroup->Collection.size())
1197*0fca6ea1SDimitry Andric         return true;
1198*0fca6ea1SDimitry Andric 
1199*0fca6ea1SDimitry Andric       for (auto &OtherEle : OtherGroup->Collection) {
1200*0fca6ea1SDimitry Andric         for (auto &Succ : OtherEle->Succs) {
1201*0fca6ea1SDimitry Andric           if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
1202*0fca6ea1SDimitry Andric             return true;
1203*0fca6ea1SDimitry Andric         }
1204*0fca6ea1SDimitry Andric       }
1205*0fca6ea1SDimitry Andric 
1206*0fca6ea1SDimitry Andric       return false;
1207*0fca6ea1SDimitry Andric     }
1208*0fca6ea1SDimitry Andric     IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1209*0fca6ea1SDimitry Andric                          unsigned SGID, bool NeedsCache = false)
1210*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1211*0fca6ea1SDimitry Andric   };
1212*0fca6ea1SDimitry Andric 
1213*0fca6ea1SDimitry Andric   /// Whether or not the instruction is a transitive successor of any
1214*0fca6ea1SDimitry Andric   /// instruction in the SchedGroup \p Distance steps before.
1215*0fca6ea1SDimitry Andric   class IsReachableFromPrevNthGroup final : public InstructionRule {
1216*0fca6ea1SDimitry Andric   private:
1217*0fca6ea1SDimitry Andric     unsigned Distance = 1;
1218*0fca6ea1SDimitry Andric 
1219*0fca6ea1SDimitry Andric   public:
1220*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1221*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1222*0fca6ea1SDimitry Andric       SchedGroup *OtherGroup = nullptr;
1223*0fca6ea1SDimitry Andric       if (!SyncPipe.size())
1224*0fca6ea1SDimitry Andric         return false;
1225*0fca6ea1SDimitry Andric 
1226*0fca6ea1SDimitry Andric       for (auto &PipeSG : SyncPipe) {
1227*0fca6ea1SDimitry Andric         if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1228*0fca6ea1SDimitry Andric           OtherGroup = &PipeSG;
1229*0fca6ea1SDimitry Andric       }
1230*0fca6ea1SDimitry Andric 
1231*0fca6ea1SDimitry Andric       if (!OtherGroup)
1232*0fca6ea1SDimitry Andric         return false;
1233*0fca6ea1SDimitry Andric       if (!OtherGroup->Collection.size())
1234*0fca6ea1SDimitry Andric         return true;
1235*0fca6ea1SDimitry Andric 
1236*0fca6ea1SDimitry Andric       auto DAG = SyncPipe[0].DAG;
1237*0fca6ea1SDimitry Andric 
1238*0fca6ea1SDimitry Andric       for (auto &OtherEle : OtherGroup->Collection)
1239*0fca6ea1SDimitry Andric         if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
1240*0fca6ea1SDimitry Andric           return true;
1241*0fca6ea1SDimitry Andric 
1242*0fca6ea1SDimitry Andric       return false;
1243*0fca6ea1SDimitry Andric     }
1244*0fca6ea1SDimitry Andric     IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1245*0fca6ea1SDimitry Andric                                 unsigned SGID, bool NeedsCache = false)
1246*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1247*0fca6ea1SDimitry Andric   };
1248*0fca6ea1SDimitry Andric 
1249*0fca6ea1SDimitry Andric   /// Whether or not the instruction occurs at or after the SU with NodeNum \p Number
1250*0fca6ea1SDimitry Andric   class OccursAtOrAfterNode final : public InstructionRule {
1251*0fca6ea1SDimitry Andric   private:
1252*0fca6ea1SDimitry Andric     unsigned Number = 1;
1253*0fca6ea1SDimitry Andric 
1254*0fca6ea1SDimitry Andric   public:
1255*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1256*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1257*0fca6ea1SDimitry Andric 
1258*0fca6ea1SDimitry Andric       return SU->NodeNum >= Number;
1259*0fca6ea1SDimitry Andric     }
1260*0fca6ea1SDimitry Andric     OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
1261*0fca6ea1SDimitry Andric                         bool NeedsCache = false)
1262*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
1263*0fca6ea1SDimitry Andric   };
1264*0fca6ea1SDimitry Andric 
1265*0fca6ea1SDimitry Andric   /// Whether or not the SU is exactly the \p Number th MFMA in the chain
1266*0fca6ea1SDimitry Andric   /// starting with \p ChainSeed
1267*0fca6ea1SDimitry Andric   class IsExactMFMA final : public InstructionRule {
1268*0fca6ea1SDimitry Andric   private:
1269*0fca6ea1SDimitry Andric     unsigned Number = 1;
1270*0fca6ea1SDimitry Andric     SUnit *ChainSeed;
1271*0fca6ea1SDimitry Andric 
1272*0fca6ea1SDimitry Andric   public:
1273*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1274*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1275*0fca6ea1SDimitry Andric       if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1276*0fca6ea1SDimitry Andric         return false;
1277*0fca6ea1SDimitry Andric 
1278*0fca6ea1SDimitry Andric       if (Cache->empty()) {
1279*0fca6ea1SDimitry Andric         auto TempSU = ChainSeed;
1280*0fca6ea1SDimitry Andric         auto Depth = Number;
1281*0fca6ea1SDimitry Andric         while (Depth > 0) {
1282*0fca6ea1SDimitry Andric           --Depth;
1283*0fca6ea1SDimitry Andric           bool Found = false;
1284*0fca6ea1SDimitry Andric           for (auto &Succ : TempSU->Succs) {
1285*0fca6ea1SDimitry Andric             if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1286*0fca6ea1SDimitry Andric               TempSU = Succ.getSUnit();
1287*0fca6ea1SDimitry Andric               Found = true;
1288*0fca6ea1SDimitry Andric               break;
1289*0fca6ea1SDimitry Andric             }
1290*0fca6ea1SDimitry Andric           }
1291*0fca6ea1SDimitry Andric           if (!Found) {
1292*0fca6ea1SDimitry Andric             return false;
1293*0fca6ea1SDimitry Andric           }
1294*0fca6ea1SDimitry Andric         }
1295*0fca6ea1SDimitry Andric         Cache->push_back(TempSU);
1296*0fca6ea1SDimitry Andric       }
1297*0fca6ea1SDimitry Andric       // If we failed to find the instruction to be placed into the cache, we
1298*0fca6ea1SDimitry Andric       // would have already exited.
1299*0fca6ea1SDimitry Andric       assert(!Cache->empty());
1300*0fca6ea1SDimitry Andric 
1301*0fca6ea1SDimitry Andric       return (*Cache)[0] == SU;
1302*0fca6ea1SDimitry Andric     }
1303*0fca6ea1SDimitry Andric 
1304*0fca6ea1SDimitry Andric     IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
1305*0fca6ea1SDimitry Andric                 unsigned SGID, bool NeedsCache = false)
1306*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1307*0fca6ea1SDimitry Andric           ChainSeed(ChainSeed) {}
1308*0fca6ea1SDimitry Andric   };
1309*0fca6ea1SDimitry Andric 
1310*0fca6ea1SDimitry Andric   // Whether the instruction occurs after the first TRANS instruction. This
1311*0fca6ea1SDimitry Andric   // implies the instruction cannot be a predecessor of the first TRANS
1312*0fca6ea1SDimitry Andric   // instruction.
1313*0fca6ea1SDimitry Andric   class OccursAfterExp final : public InstructionRule {
1314*0fca6ea1SDimitry Andric   public:
1315*0fca6ea1SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1316*0fca6ea1SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
1317*0fca6ea1SDimitry Andric 
1318*0fca6ea1SDimitry Andric       SmallVector<SUnit *, 12> Worklist;
1319*0fca6ea1SDimitry Andric       auto DAG = SyncPipe[0].DAG;
1320*0fca6ea1SDimitry Andric       if (Cache->empty()) {
1321*0fca6ea1SDimitry Andric         for (auto &SU : DAG->SUnits)
1322*0fca6ea1SDimitry Andric           if (TII->isTRANS(SU.getInstr()->getOpcode())) {
1323*0fca6ea1SDimitry Andric             Cache->push_back(&SU);
1324*0fca6ea1SDimitry Andric             break;
1325*0fca6ea1SDimitry Andric           }
1326*0fca6ea1SDimitry Andric         if (Cache->empty())
1327*0fca6ea1SDimitry Andric           return false;
1328*0fca6ea1SDimitry Andric       }
1329*0fca6ea1SDimitry Andric 
1330*0fca6ea1SDimitry Andric       return SU->NodeNum > (*Cache)[0]->NodeNum;
1331*0fca6ea1SDimitry Andric     }
1332*0fca6ea1SDimitry Andric 
1333*0fca6ea1SDimitry Andric     OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
1334*0fca6ea1SDimitry Andric                    bool NeedsCache = false)
1335*0fca6ea1SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
1336*0fca6ea1SDimitry Andric   };
1337*0fca6ea1SDimitry Andric 
1338*0fca6ea1SDimitry Andric public:
1339*0fca6ea1SDimitry Andric   bool applyIGLPStrategy(
1340*0fca6ea1SDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1341*0fca6ea1SDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1342*0fca6ea1SDimitry Andric       AMDGPU::SchedulingPhase Phase) override;
1343*0fca6ea1SDimitry Andric 
1344*0fca6ea1SDimitry Andric   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1345*0fca6ea1SDimitry Andric                            AMDGPU::SchedulingPhase Phase) override;
1346*0fca6ea1SDimitry Andric 
1347*0fca6ea1SDimitry Andric   MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
1348*0fca6ea1SDimitry Andric       : IGLPStrategy(DAG, TII) {
1349*0fca6ea1SDimitry Andric     IsBottomUp = false;
1350*0fca6ea1SDimitry Andric   }
1351*0fca6ea1SDimitry Andric };
1352*0fca6ea1SDimitry Andric 
1353*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
1354*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
1355*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
1356*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
1357*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
1358*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
1359*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
1360*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::HasCvt = false;
1361*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
1362*0fca6ea1SDimitry Andric std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1363*0fca6ea1SDimitry Andric 
1364*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
1365*0fca6ea1SDimitry Andric   SmallVector<SUnit *, 10> ExpPipeCands;
1366*0fca6ea1SDimitry Andric   SmallVector<SUnit *, 10> MFMAPipeCands;
1367*0fca6ea1SDimitry Andric   SmallVector<SUnit *, 10> MFMAPipeSUs;
1368*0fca6ea1SDimitry Andric   SmallVector<SUnit *, 10> PackSUs;
1369*0fca6ea1SDimitry Andric   SmallVector<SUnit *, 10> CvtSUs;
1370*0fca6ea1SDimitry Andric 
1371*0fca6ea1SDimitry Andric   auto isBitPack = [](unsigned Opc) {
1372*0fca6ea1SDimitry Andric     return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
1373*0fca6ea1SDimitry Andric   };
1374*0fca6ea1SDimitry Andric 
1375*0fca6ea1SDimitry Andric   auto isCvt = [](unsigned Opc) {
1376*0fca6ea1SDimitry Andric     return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
1377*0fca6ea1SDimitry Andric   };
1378*0fca6ea1SDimitry Andric 
1379*0fca6ea1SDimitry Andric   auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };
1380*0fca6ea1SDimitry Andric 
1381*0fca6ea1SDimitry Andric   AddPipeCount = 0;
1382*0fca6ea1SDimitry Andric   for (SUnit &SU : DAG->SUnits) {
1383*0fca6ea1SDimitry Andric     auto Opc = SU.getInstr()->getOpcode();
1384*0fca6ea1SDimitry Andric     if (TII->isTRANS(Opc)) {
1385*0fca6ea1SDimitry Andric       // Avoid counting a potential bonus V_EXP which all the MFMAs depend on
1386*0fca6ea1SDimitry Andric       if (SU.Succs.size() >= 7)
1387*0fca6ea1SDimitry Andric         continue;
1388*0fca6ea1SDimitry Andric       for (auto &Succ : SU.Succs) {
1389*0fca6ea1SDimitry Andric         if (Succ.getSUnit()->Succs.size() >= 7)
1390*0fca6ea1SDimitry Andric           continue;
1391*0fca6ea1SDimitry Andric       }
1392*0fca6ea1SDimitry Andric       ExpPipeCands.push_back(&SU);
1393*0fca6ea1SDimitry Andric     }
1394*0fca6ea1SDimitry Andric 
1395*0fca6ea1SDimitry Andric     if (TII->isMFMAorWMMA(*SU.getInstr()))
1396*0fca6ea1SDimitry Andric       MFMAPipeCands.push_back(&SU);
1397*0fca6ea1SDimitry Andric 
1398*0fca6ea1SDimitry Andric     if (isBitPack(Opc))
1399*0fca6ea1SDimitry Andric       PackSUs.push_back(&SU);
1400*0fca6ea1SDimitry Andric 
1401*0fca6ea1SDimitry Andric     if (isCvt(Opc))
1402*0fca6ea1SDimitry Andric       CvtSUs.push_back(&SU);
1403*0fca6ea1SDimitry Andric 
1404*0fca6ea1SDimitry Andric     if (isAdd(Opc))
1405*0fca6ea1SDimitry Andric       ++AddPipeCount;
1406*0fca6ea1SDimitry Andric   }
1407*0fca6ea1SDimitry Andric 
1408*0fca6ea1SDimitry Andric   if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
1409*0fca6ea1SDimitry Andric     return false;
1410*0fca6ea1SDimitry Andric 
1411*0fca6ea1SDimitry Andric   TransPipeCount = 0;
1412*0fca6ea1SDimitry Andric 
1413*0fca6ea1SDimitry Andric   std::optional<SUnit *> TempMFMA;
1414*0fca6ea1SDimitry Andric   std::optional<SUnit *> TempExp;
1415*0fca6ea1SDimitry Andric   // Count the number of EXPs that reach an MFMA
1416*0fca6ea1SDimitry Andric   for (auto &PredSU : ExpPipeCands) {
1417*0fca6ea1SDimitry Andric     for (auto &SuccSU : MFMAPipeCands) {
1418*0fca6ea1SDimitry Andric       if (DAG->IsReachable(SuccSU, PredSU)) {
1419*0fca6ea1SDimitry Andric         if (!TempExp) {
1420*0fca6ea1SDimitry Andric           TempExp = PredSU;
1421*0fca6ea1SDimitry Andric           TempMFMA = SuccSU;
1422*0fca6ea1SDimitry Andric         }
1423*0fca6ea1SDimitry Andric         MFMAPipeSUs.push_back(SuccSU);
1424*0fca6ea1SDimitry Andric         ++TransPipeCount;
1425*0fca6ea1SDimitry Andric         break;
1426*0fca6ea1SDimitry Andric       }
1427*0fca6ea1SDimitry Andric     }
1428*0fca6ea1SDimitry Andric   }
1429*0fca6ea1SDimitry Andric 
1430*0fca6ea1SDimitry Andric   if (!(TempExp && TempMFMA))
1431*0fca6ea1SDimitry Andric     return false;
1432*0fca6ea1SDimitry Andric 
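  // HasChainBetweenCvt is set when none of the candidate EXP's immediate
  // successors is a V_CVT, i.e. other instructions sit between the TRANS
  // result and its conversion.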
1433*0fca6ea1SDimitry Andric   HasChainBetweenCvt =
1434*0fca6ea1SDimitry Andric       std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(),
1435*0fca6ea1SDimitry Andric                    [&isCvt](SDep &Succ) {
1436*0fca6ea1SDimitry Andric                      return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
1437*0fca6ea1SDimitry Andric                    }) == (*TempExp)->Succs.end();
1438*0fca6ea1SDimitry Andric 
1439*0fca6ea1SDimitry Andric   // Count the number of MFMAs that are reached by an EXP
1440*0fca6ea1SDimitry Andric   for (auto &SuccSU : MFMAPipeCands) {
1441*0fca6ea1SDimitry Andric     if (MFMAPipeSUs.size() &&
1442*0fca6ea1SDimitry Andric         std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),
1443*0fca6ea1SDimitry Andric                      [&SuccSU](SUnit *PotentialMatch) {
1444*0fca6ea1SDimitry Andric                        return PotentialMatch->NodeNum == SuccSU->NodeNum;
1445*0fca6ea1SDimitry Andric                      }) != MFMAPipeSUs.end())
1446*0fca6ea1SDimitry Andric       continue;
1447*0fca6ea1SDimitry Andric 
1448*0fca6ea1SDimitry Andric     for (auto &PredSU : ExpPipeCands) {
1449*0fca6ea1SDimitry Andric       if (DAG->IsReachable(SuccSU, PredSU)) {
1450*0fca6ea1SDimitry Andric         MFMAPipeSUs.push_back(SuccSU);
1451*0fca6ea1SDimitry Andric         break;
1452*0fca6ea1SDimitry Andric       }
1453*0fca6ea1SDimitry Andric     }
1454*0fca6ea1SDimitry Andric   }
1455*0fca6ea1SDimitry Andric 
1456*0fca6ea1SDimitry Andric   MFMAPipeCount = MFMAPipeSUs.size();
1457*0fca6ea1SDimitry Andric 
1458*0fca6ea1SDimitry Andric   assert(TempExp && TempMFMA);
1459*0fca6ea1SDimitry Andric   assert(MFMAPipeCount > 0);
1460*0fca6ea1SDimitry Andric 
1461*0fca6ea1SDimitry Andric   std::optional<SUnit *> TempCvt;
1462*0fca6ea1SDimitry Andric   for (auto &SuccSU : CvtSUs) {
1463*0fca6ea1SDimitry Andric     if (DAG->IsReachable(SuccSU, *TempExp)) {
1464*0fca6ea1SDimitry Andric       TempCvt = SuccSU;
1465*0fca6ea1SDimitry Andric       break;
1466*0fca6ea1SDimitry Andric     }
1467*0fca6ea1SDimitry Andric   }
1468*0fca6ea1SDimitry Andric 
1469*0fca6ea1SDimitry Andric   HasCvt = false;
1470*0fca6ea1SDimitry Andric   if (TempCvt.has_value()) {
1471*0fca6ea1SDimitry Andric     for (auto &SuccSU : MFMAPipeSUs) {
1472*0fca6ea1SDimitry Andric       if (DAG->IsReachable(SuccSU, *TempCvt)) {
1473*0fca6ea1SDimitry Andric         HasCvt = true;
1474*0fca6ea1SDimitry Andric         break;
1475*0fca6ea1SDimitry Andric       }
1476*0fca6ea1SDimitry Andric     }
1477*0fca6ea1SDimitry Andric   }
1478*0fca6ea1SDimitry Andric 
1479*0fca6ea1SDimitry Andric   MFMAChains = 0;
1480*0fca6ea1SDimitry Andric   for (auto &MFMAPipeSU : MFMAPipeSUs) {
1481*0fca6ea1SDimitry Andric     if (is_contained(MFMAChainSeeds, MFMAPipeSU))
1482*0fca6ea1SDimitry Andric       continue;
1483*0fca6ea1SDimitry Andric     if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
1484*0fca6ea1SDimitry Andric                      [&TII](SDep &Succ) {
1485*0fca6ea1SDimitry Andric                        return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1486*0fca6ea1SDimitry Andric                      })) {
1487*0fca6ea1SDimitry Andric       MFMAChainSeeds.push_back(MFMAPipeSU);
1488*0fca6ea1SDimitry Andric       ++MFMAChains;
1489*0fca6ea1SDimitry Andric     }
1490*0fca6ea1SDimitry Andric   }
1491*0fca6ea1SDimitry Andric 
1492*0fca6ea1SDimitry Andric   if (!MFMAChains)
1493*0fca6ea1SDimitry Andric     return false;
1494*0fca6ea1SDimitry Andric 
1495*0fca6ea1SDimitry Andric   for (auto Pred : MFMAChainSeeds[0]->Preds) {
1496*0fca6ea1SDimitry Andric     if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
1497*0fca6ea1SDimitry Andric         Pred.getSUnit()->getInstr()->mayLoad())
1498*0fca6ea1SDimitry Andric       FirstPipeDSR = Pred.getSUnit()->NodeNum;
1499*0fca6ea1SDimitry Andric   }
1500*0fca6ea1SDimitry Andric 
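  // Assume the pipeline MFMAs are distributed evenly across the independent
  // chains.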
1501*0fca6ea1SDimitry Andric   MFMAChainLength = MFMAPipeCount / MFMAChains;
1502*0fca6ea1SDimitry Andric 
1503*0fca6ea1SDimitry Andric   // The number of bit pack operations that depend on a single V_EXP
1504*0fca6ea1SDimitry Andric   unsigned PackSuccCount = std::count_if(
1505*0fca6ea1SDimitry Andric       PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
1506*0fca6ea1SDimitry Andric         return DAG->IsReachable(VPack, *TempExp);
1507*0fca6ea1SDimitry Andric       });
1508*0fca6ea1SDimitry Andric 
1509*0fca6ea1SDimitry Andric   // The number of bit pack operations an MFMA depends on
1510*0fca6ea1SDimitry Andric   unsigned PackPredCount =
1511*0fca6ea1SDimitry Andric       std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
1512*0fca6ea1SDimitry Andric                     [&isBitPack](SDep &Pred) {
1513*0fca6ea1SDimitry Andric                       auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1514*0fca6ea1SDimitry Andric                       return isBitPack(Opc);
1515*0fca6ea1SDimitry Andric                     });
1516*0fca6ea1SDimitry Andric 
1517*0fca6ea1SDimitry Andric   auto PackPred =
1518*0fca6ea1SDimitry Andric       std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
1519*0fca6ea1SDimitry Andric                    [&isBitPack](SDep &Pred) {
1520*0fca6ea1SDimitry Andric                      auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1521*0fca6ea1SDimitry Andric                      return isBitPack(Opc);
1522*0fca6ea1SDimitry Andric                    });
1523*0fca6ea1SDimitry Andric 
1524*0fca6ea1SDimitry Andric   if (PackPred == (*TempMFMA)->Preds.end())
1525*0fca6ea1SDimitry Andric     return false;
1526*0fca6ea1SDimitry Andric 
1527*0fca6ea1SDimitry Andric   MFMAEnablement = 0;
1528*0fca6ea1SDimitry Andric   ExpRequirement = 0;
1529*0fca6ea1SDimitry Andric   // How many MFMAs depend on a single bit pack operation
1530*0fca6ea1SDimitry Andric   MFMAEnablement =
1531*0fca6ea1SDimitry Andric       std::count_if(PackPred->getSUnit()->Succs.begin(),
1532*0fca6ea1SDimitry Andric                     PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
1533*0fca6ea1SDimitry Andric                       return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1534*0fca6ea1SDimitry Andric                     });
1535*0fca6ea1SDimitry Andric 
1536*0fca6ea1SDimitry Andric   // The number of MFMAs that depend on a single V_EXP
1537*0fca6ea1SDimitry Andric   MFMAEnablement *= PackSuccCount;
1538*0fca6ea1SDimitry Andric 
1539*0fca6ea1SDimitry Andric   // The number of V_EXPs required to resolve all dependencies for an MFMA
1540*0fca6ea1SDimitry Andric   ExpRequirement =
1541*0fca6ea1SDimitry Andric       std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),
1542*0fca6ea1SDimitry Andric                     [this, &PackPred](SUnit *ExpBase) {
1543*0fca6ea1SDimitry Andric                       return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
1544*0fca6ea1SDimitry Andric                     });
1545*0fca6ea1SDimitry Andric 
1546*0fca6ea1SDimitry Andric   ExpRequirement *= PackPredCount;
1547*0fca6ea1SDimitry Andric   return true;
1548*0fca6ea1SDimitry Andric }
1549*0fca6ea1SDimitry Andric 
1550*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1551*0fca6ea1SDimitry Andric                                                AMDGPU::SchedulingPhase Phase) {
1552*0fca6ea1SDimitry Andric   const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1553*0fca6ea1SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1554*0fca6ea1SDimitry Andric 
1555*0fca6ea1SDimitry Andric   if (Phase != AMDGPU::SchedulingPhase::PostRA)
1556*0fca6ea1SDimitry Andric     MFMAChainSeeds.clear();
1557*0fca6ea1SDimitry Andric   if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
1558*0fca6ea1SDimitry Andric     return false;
1559*0fca6ea1SDimitry Andric 
1560*0fca6ea1SDimitry Andric   return true;
1561*0fca6ea1SDimitry Andric }
1562*0fca6ea1SDimitry Andric 
1563*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::applyIGLPStrategy(
1564*0fca6ea1SDimitry Andric     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1565*0fca6ea1SDimitry Andric     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1566*0fca6ea1SDimitry Andric     AMDGPU::SchedulingPhase Phase) {
1567*0fca6ea1SDimitry Andric 
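  // Only two pipeline shapes (as measured by analyzeDAG) are handled; these
  // appear to correspond to the small and large GEMM kernel variants this
  // mutation is tuned for. Any other shape falls back to default scheduling.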
1568*0fca6ea1SDimitry Andric   bool IsSmallKernelType =
1569*0fca6ea1SDimitry Andric       MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
1570*0fca6ea1SDimitry Andric   bool IsLargeKernelType =
1571*0fca6ea1SDimitry Andric       MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
1572*0fca6ea1SDimitry Andric 
1573*0fca6ea1SDimitry Andric   if (!(IsSmallKernelType || IsLargeKernelType))
1574*0fca6ea1SDimitry Andric     return false;
1575*0fca6ea1SDimitry Andric 
1576*0fca6ea1SDimitry Andric   const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1577*0fca6ea1SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1578*0fca6ea1SDimitry Andric 
1579*0fca6ea1SDimitry Andric   unsigned PipelineSyncID = 0;
1580*0fca6ea1SDimitry Andric   SchedGroup *SG = nullptr;
1581*0fca6ea1SDimitry Andric 
1582*0fca6ea1SDimitry Andric   unsigned MFMAChain = 0;
1583*0fca6ea1SDimitry Andric   unsigned PositionInChain = 0;
1584*0fca6ea1SDimitry Andric   unsigned CurrMFMAForTransPosition = 0;
1585*0fca6ea1SDimitry Andric 
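  // Helpers tracking which MFMA the next batch of TRANS ops should enable:
  // each batch consumes MFMAEnablement MFMAs, distributed round-robin across
  // the independent chains (chain = count % MFMAChains, position within the
  // chain = count / MFMAChains).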
1586*0fca6ea1SDimitry Andric   auto incrementTransPosition = [&MFMAChain, &PositionInChain,
1587*0fca6ea1SDimitry Andric                                  &CurrMFMAForTransPosition]() {
1588*0fca6ea1SDimitry Andric     CurrMFMAForTransPosition += MFMAEnablement;
1589*0fca6ea1SDimitry Andric     PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
1590*0fca6ea1SDimitry Andric     MFMAChain = CurrMFMAForTransPosition % MFMAChains;
1591*0fca6ea1SDimitry Andric   };
1592*0fca6ea1SDimitry Andric 
1593*0fca6ea1SDimitry Andric   auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
1594*0fca6ea1SDimitry Andric     auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1595*0fca6ea1SDimitry Andric     return (TempMFMAForTrans / MFMAChains);
1596*0fca6ea1SDimitry Andric   };
1597*0fca6ea1SDimitry Andric 
1598*0fca6ea1SDimitry Andric   auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
1599*0fca6ea1SDimitry Andric     auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1600*0fca6ea1SDimitry Andric     return TempMFMAForTrans % MFMAChains;
1601*0fca6ea1SDimitry Andric   };
1602*0fca6ea1SDimitry Andric 
1603*0fca6ea1SDimitry Andric   unsigned CurrMFMAPosition = 0;
1604*0fca6ea1SDimitry Andric   unsigned MFMAChainForMFMA = 0;
1605*0fca6ea1SDimitry Andric   unsigned PositionInChainForMFMA = 0;
1606*0fca6ea1SDimitry Andric 
1607*0fca6ea1SDimitry Andric   auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
1608*0fca6ea1SDimitry Andric                                 &PositionInChainForMFMA]() {
1609*0fca6ea1SDimitry Andric     ++CurrMFMAPosition;
1610*0fca6ea1SDimitry Andric     MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
1611*0fca6ea1SDimitry Andric     PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
1612*0fca6ea1SDimitry Andric   };
1613*0fca6ea1SDimitry Andric 
1614*0fca6ea1SDimitry Andric   bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
1615*0fca6ea1SDimitry Andric   assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
1616*0fca6ea1SDimitry Andric 
1617*0fca6ea1SDimitry Andric   bool UsesFMA = IsSmallKernelType || !IsPostRA;
1618*0fca6ea1SDimitry Andric   bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
1619*0fca6ea1SDimitry Andric   bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
1620*0fca6ea1SDimitry Andric   bool UsesVALU = IsSmallKernelType;
1621*0fca6ea1SDimitry Andric 
1622*0fca6ea1SDimitry Andric   // PHASE 1: "Prefetch"
1623*0fca6ea1SDimitry Andric   if (UsesFMA) {
1624*0fca6ea1SDimitry Andric     // First Round FMA
1625*0fca6ea1SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1626*0fca6ea1SDimitry Andric         SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
1627*0fca6ea1SDimitry Andric     if (!IsPostRA && MFMAChains) {
1628*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1629*0fca6ea1SDimitry Andric           PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1630*0fca6ea1SDimitry Andric           true));
1631*0fca6ea1SDimitry Andric     } else
1632*0fca6ea1SDimitry Andric       SG->addRule(
1633*0fca6ea1SDimitry Andric           std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
1634*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1635*0fca6ea1SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1636*0fca6ea1SDimitry Andric 
1637*0fca6ea1SDimitry Andric     // Second Round FMA
1638*0fca6ea1SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1639*0fca6ea1SDimitry Andric         SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
1640*0fca6ea1SDimitry Andric     if (!IsPostRA && MFMAChains) {
1641*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1642*0fca6ea1SDimitry Andric           getNextTransPositionInChain(),
1643*0fca6ea1SDimitry Andric           MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
1644*0fca6ea1SDimitry Andric     } else
1645*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
1646*0fca6ea1SDimitry Andric                                                    SG->getSGID(), true));
1647*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1648*0fca6ea1SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1649*0fca6ea1SDimitry Andric   }
1650*0fca6ea1SDimitry Andric 
1651*0fca6ea1SDimitry Andric   if (UsesDSRead) {
1652*0fca6ea1SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1653*0fca6ea1SDimitry Andric         SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
1654*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
1655*0fca6ea1SDimitry Andric                                                       SG->getSGID()));
1656*0fca6ea1SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1657*0fca6ea1SDimitry Andric   }
1658*0fca6ea1SDimitry Andric 
1659*0fca6ea1SDimitry Andric   // First Round EXP
1660*0fca6ea1SDimitry Andric   SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1661*0fca6ea1SDimitry Andric       SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
1662*0fca6ea1SDimitry Andric   if (!IsPostRA && MFMAChains)
1663*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1664*0fca6ea1SDimitry Andric         PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
1665*0fca6ea1SDimitry Andric   else
1666*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
1667*0fca6ea1SDimitry Andric   SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1668*0fca6ea1SDimitry Andric   SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1669*0fca6ea1SDimitry Andric                                                HasChainBetweenCvt));
1670*0fca6ea1SDimitry Andric   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1671*0fca6ea1SDimitry Andric 
1672*0fca6ea1SDimitry Andric   incrementTransPosition();
1673*0fca6ea1SDimitry Andric 
1674*0fca6ea1SDimitry Andric   // First Round CVT, Third Round FMA, Second Round EXP; interleaved
1675*0fca6ea1SDimitry Andric   for (unsigned I = 0; I < ExpRequirement; I++) {
1676*0fca6ea1SDimitry Andric     // First Round CVT
1677*0fca6ea1SDimitry Andric     if (UsesCvt) {
1678*0fca6ea1SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1679*0fca6ea1SDimitry Andric           SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1680*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
1681*0fca6ea1SDimitry Andric       if (HasChainBetweenCvt)
1682*0fca6ea1SDimitry Andric         SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1683*0fca6ea1SDimitry Andric             1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
1684*0fca6ea1SDimitry Andric       else
1685*0fca6ea1SDimitry Andric         SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
1686*0fca6ea1SDimitry Andric             1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
1687*0fca6ea1SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1688*0fca6ea1SDimitry Andric     }
1689*0fca6ea1SDimitry Andric 
1690*0fca6ea1SDimitry Andric     // Third Round FMA
1691*0fca6ea1SDimitry Andric     if (UsesFMA) {
1692*0fca6ea1SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1693*0fca6ea1SDimitry Andric           SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1694*0fca6ea1SDimitry Andric       if (!IsPostRA && MFMAChains) {
1695*0fca6ea1SDimitry Andric         SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1696*0fca6ea1SDimitry Andric             getNextTransPositionInChain(),
1697*0fca6ea1SDimitry Andric             MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
1698*0fca6ea1SDimitry Andric       } else
1699*0fca6ea1SDimitry Andric         SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
1700*0fca6ea1SDimitry Andric                                                      TII, SG->getSGID(), true));
1701*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1702*0fca6ea1SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1703*0fca6ea1SDimitry Andric     }
1704*0fca6ea1SDimitry Andric 
1705*0fca6ea1SDimitry Andric     // Second Round EXP
1706*0fca6ea1SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1707*0fca6ea1SDimitry Andric         SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1708*0fca6ea1SDimitry Andric     if (!IsPostRA && MFMAChains)
1709*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1710*0fca6ea1SDimitry Andric           PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1711*0fca6ea1SDimitry Andric           true));
1712*0fca6ea1SDimitry Andric     else
1713*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
1714*0fca6ea1SDimitry Andric                                                    SG->getSGID(), true));
1715*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1716*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1717*0fca6ea1SDimitry Andric                                                  HasChainBetweenCvt));
1718*0fca6ea1SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1719*0fca6ea1SDimitry Andric   }
1720*0fca6ea1SDimitry Andric 
1721*0fca6ea1SDimitry Andric   // The "extra" EXP which enables all MFMA
1722*0fca6ea1SDimitry Andric   // TODO: UsesExtraExp
1723*0fca6ea1SDimitry Andric   SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1724*0fca6ea1SDimitry Andric       SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1725*0fca6ea1SDimitry Andric   SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1726*0fca6ea1SDimitry Andric   SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
1727*0fca6ea1SDimitry Andric       8, TII, SG->getSGID(), HasChainBetweenCvt));
1728*0fca6ea1SDimitry Andric   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1729*0fca6ea1SDimitry Andric 
1730*0fca6ea1SDimitry Andric   // PHASE 2: Main Interleave Loop
1731*0fca6ea1SDimitry Andric 
1732*0fca6ea1SDimitry Andric   // The number of MFMAs per iteration
1733*0fca6ea1SDimitry Andric   unsigned MFMARatio =
1734*0fca6ea1SDimitry Andric       MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
1735*0fca6ea1SDimitry Andric   // The number of Exps per iteration
1736*0fca6ea1SDimitry Andric   unsigned ExpRatio =
1737*0fca6ea1SDimitry Andric       MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
1738*0fca6ea1SDimitry Andric   // The remaining Exps
1739*0fca6ea1SDimitry Andric   unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
1740*0fca6ea1SDimitry Andric                               ? TransPipeCount - (2 * ExpRequirement)
1741*0fca6ea1SDimitry Andric                               : 0;
1742*0fca6ea1SDimitry Andric   unsigned ExpLoopCount = RemainingExp / ExpRatio;
1743*0fca6ea1SDimitry Andric   // In loop MFMAs
1744*0fca6ea1SDimitry Andric   unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
1745*0fca6ea1SDimitry Andric                             ? MFMAPipeCount - (MFMAEnablement * 2)
1746*0fca6ea1SDimitry Andric                             : 0;
1747*0fca6ea1SDimitry Andric   unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1748*0fca6ea1SDimitry Andric   unsigned VALUOps =
1749*0fca6ea1SDimitry Andric       AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
1750*0fca6ea1SDimitry Andric   unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
1751*0fca6ea1SDimitry Andric 
1752*0fca6ea1SDimitry Andric   for (unsigned I = 0; I < LoopSize; I++) {
1753*0fca6ea1SDimitry Andric     if (!(I * ExpRatio % ExpRequirement))
1754*0fca6ea1SDimitry Andric       incrementTransPosition();
1755*0fca6ea1SDimitry Andric 
1756*0fca6ea1SDimitry Andric     // Round N MFMA
1757*0fca6ea1SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1758*0fca6ea1SDimitry Andric         SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
1759*0fca6ea1SDimitry Andric     if (!IsPostRA && MFMAChains)
1760*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<IsExactMFMA>(
1761*0fca6ea1SDimitry Andric           PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
1762*0fca6ea1SDimitry Andric           SG->getSGID(), true));
1763*0fca6ea1SDimitry Andric     else
1764*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
1765*0fca6ea1SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1766*0fca6ea1SDimitry Andric     incrementMFMAPosition();
1767*0fca6ea1SDimitry Andric 
1768*0fca6ea1SDimitry Andric     if (UsesVALU) {
1769*0fca6ea1SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1770*0fca6ea1SDimitry Andric           SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
1771*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
1772*0fca6ea1SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1773*0fca6ea1SDimitry Andric     }
1774*0fca6ea1SDimitry Andric 
1775*0fca6ea1SDimitry Andric     if (UsesDSRead && !(I % 4)) {
1776*0fca6ea1SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1777*0fca6ea1SDimitry Andric           SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
1778*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
1779*0fca6ea1SDimitry Andric                                                         SG->getSGID()));
1780*0fca6ea1SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1781*0fca6ea1SDimitry Andric     }
1782*0fca6ea1SDimitry Andric 
1783*0fca6ea1SDimitry Andric     // CVT, EXP, FMA Interleaving
1784*0fca6ea1SDimitry Andric     for (unsigned J = 0; J < ExpRatio; J++) {
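      // The offsets below estimate how many SchedGroups separate this CVT
      // group from the EXP group that produces its input, so that the
      // Nth-previous-group rules can reach back to that EXP.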
1785*0fca6ea1SDimitry Andric       auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
1786*0fca6ea1SDimitry Andric       auto MaxMFMAOffset =
1787*0fca6ea1SDimitry Andric           (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1788*0fca6ea1SDimitry Andric 
1789*0fca6ea1SDimitry Andric       // Round N + 1 CVT
1790*0fca6ea1SDimitry Andric       if (UsesCvt) {
1791*0fca6ea1SDimitry Andric         SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1792*0fca6ea1SDimitry Andric             SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1793*0fca6ea1SDimitry Andric         SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
1794*0fca6ea1SDimitry Andric         auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
1795*0fca6ea1SDimitry Andric         auto DSROffset = I / 4 + 1;
1796*0fca6ea1SDimitry Andric         auto MaxDSROffset = MaxMFMAOffset / 4;
1797*0fca6ea1SDimitry Andric         // TODO: UsesExtraExp
1798*0fca6ea1SDimitry Andric         auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
1799*0fca6ea1SDimitry Andric         auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
1800*0fca6ea1SDimitry Andric                              std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
1801*0fca6ea1SDimitry Andric                              ExpOffset;
1802*0fca6ea1SDimitry Andric         if (HasChainBetweenCvt)
1803*0fca6ea1SDimitry Andric           SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1804*0fca6ea1SDimitry Andric               CurrentOffset, TII, SG->getSGID()));
1805*0fca6ea1SDimitry Andric         else
1806*0fca6ea1SDimitry Andric           SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
1807*0fca6ea1SDimitry Andric                                                              SG->getSGID()));
1808*0fca6ea1SDimitry Andric         SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1809*0fca6ea1SDimitry Andric       }
1810*0fca6ea1SDimitry Andric 
1811*0fca6ea1SDimitry Andric       // Round N + 3 FMA
1812*0fca6ea1SDimitry Andric       if (UsesFMA) {
1813*0fca6ea1SDimitry Andric         SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1814*0fca6ea1SDimitry Andric             SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1815*0fca6ea1SDimitry Andric         if (!IsPostRA && MFMAChains)
1816*0fca6ea1SDimitry Andric           SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1817*0fca6ea1SDimitry Andric               getNextTransPositionInChain(),
1818*0fca6ea1SDimitry Andric               MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
1819*0fca6ea1SDimitry Andric               true));
1820*0fca6ea1SDimitry Andric         else
1821*0fca6ea1SDimitry Andric           SG->addRule(std::make_shared<EnablesNthMFMA>(
1822*0fca6ea1SDimitry Andric               (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
1823*0fca6ea1SDimitry Andric               TII, SG->getSGID(), true));
1824*0fca6ea1SDimitry Andric         SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1825*0fca6ea1SDimitry Andric         SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1826*0fca6ea1SDimitry Andric       }
1827*0fca6ea1SDimitry Andric 
1828*0fca6ea1SDimitry Andric       // Round N + 2 Exp
1829*0fca6ea1SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1830*0fca6ea1SDimitry Andric           SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1831*0fca6ea1SDimitry Andric       if (!IsPostRA && MFMAChains)
1832*0fca6ea1SDimitry Andric         SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1833*0fca6ea1SDimitry Andric             PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1834*0fca6ea1SDimitry Andric             true));
1835*0fca6ea1SDimitry Andric       else
1836*0fca6ea1SDimitry Andric         SG->addRule(std::make_shared<EnablesNthMFMA>(
1837*0fca6ea1SDimitry Andric             (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
1838*0fca6ea1SDimitry Andric             TII, SG->getSGID(), true));
1839*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1840*0fca6ea1SDimitry Andric       SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1841*0fca6ea1SDimitry Andric                                                    HasChainBetweenCvt));
1842*0fca6ea1SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1843*0fca6ea1SDimitry Andric     }
1844*0fca6ea1SDimitry Andric   }
1845*0fca6ea1SDimitry Andric 
1846*0fca6ea1SDimitry Andric   // PHASE 3: Remaining MFMAs
1847*0fca6ea1SDimitry Andric   SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1848*0fca6ea1SDimitry Andric       SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
1849*0fca6ea1SDimitry Andric   SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
1850*0fca6ea1SDimitry Andric   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1851*0fca6ea1SDimitry Andric   return true;
1852bdd1243dSDimitry Andric }
1853bdd1243dSDimitry Andric 
185406c3fb27SDimitry Andric class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
185506c3fb27SDimitry Andric private:
185606c3fb27SDimitry Andric   // Whether the DS_READ is a predecessor of the first four MFMAs in the region
185706c3fb27SDimitry Andric   class EnablesInitialMFMA final : public InstructionRule {
185806c3fb27SDimitry Andric   public:
185906c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
186006c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
186106c3fb27SDimitry Andric       if (!SyncPipe.size())
186206c3fb27SDimitry Andric         return false;
186306c3fb27SDimitry Andric       int MFMAsFound = 0;
186406c3fb27SDimitry Andric       if (!Cache->size()) {
186506c3fb27SDimitry Andric         for (auto &Elt : SyncPipe[0].DAG->SUnits) {
186606c3fb27SDimitry Andric           if (TII->isMFMAorWMMA(*Elt.getInstr())) {
186706c3fb27SDimitry Andric             ++MFMAsFound;
186806c3fb27SDimitry Andric             if (MFMAsFound > 4)
186906c3fb27SDimitry Andric               break;
187006c3fb27SDimitry Andric             Cache->push_back(&Elt);
187106c3fb27SDimitry Andric           }
187206c3fb27SDimitry Andric         }
187306c3fb27SDimitry Andric       }
187406c3fb27SDimitry Andric 
187506c3fb27SDimitry Andric       assert(Cache->size());
187606c3fb27SDimitry Andric       auto DAG = SyncPipe[0].DAG;
187706c3fb27SDimitry Andric       for (auto &Elt : *Cache) {
187806c3fb27SDimitry Andric         if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
187906c3fb27SDimitry Andric           return true;
188006c3fb27SDimitry Andric       }
188106c3fb27SDimitry Andric       return false;
188206c3fb27SDimitry Andric     }
188306c3fb27SDimitry Andric 
188406c3fb27SDimitry Andric     EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
188506c3fb27SDimitry Andric                        bool NeedsCache = false)
188606c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
188706c3fb27SDimitry Andric   };
188806c3fb27SDimitry Andric 
188906c3fb27SDimitry Andric   // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
189006c3fb27SDimitry Andric   class IsPermForDSW final : public InstructionRule {
189106c3fb27SDimitry Andric   public:
189206c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
189306c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
189406c3fb27SDimitry Andric       auto MI = SU->getInstr();
189506c3fb27SDimitry Andric       if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
189606c3fb27SDimitry Andric         return false;
189706c3fb27SDimitry Andric 
189806c3fb27SDimitry Andric       bool FitsInGroup = false;
189906c3fb27SDimitry Andric       // Does the VALU have a DS_WRITE successor?
190006c3fb27SDimitry Andric       if (!Collection.size()) {
190106c3fb27SDimitry Andric         for (auto &Succ : SU->Succs) {
190206c3fb27SDimitry Andric           SUnit *SuccUnit = Succ.getSUnit();
190306c3fb27SDimitry Andric           if (TII->isDS(*SuccUnit->getInstr()) &&
190406c3fb27SDimitry Andric               SuccUnit->getInstr()->mayStore()) {
190506c3fb27SDimitry Andric             Cache->push_back(SuccUnit);
190606c3fb27SDimitry Andric             FitsInGroup = true;
190706c3fb27SDimitry Andric           }
190806c3fb27SDimitry Andric         }
190906c3fb27SDimitry Andric         return FitsInGroup;
191006c3fb27SDimitry Andric       }
191106c3fb27SDimitry Andric 
191206c3fb27SDimitry Andric       assert(Cache->size());
191306c3fb27SDimitry Andric 
191406c3fb27SDimitry Andric       // Does the VALU have a DS_WRITE successor that is the same as another
191506c3fb27SDimitry Andric       // VALU already in the group? The V_PERMs will all share one DS_WRITE succ.
19165f757f3fSDimitry Andric       return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
19175f757f3fSDimitry Andric         return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
191806c3fb27SDimitry Andric           return ThisSucc.getSUnit() == Elt;
191906c3fb27SDimitry Andric         });
192006c3fb27SDimitry Andric       });
192106c3fb27SDimitry Andric     }
192206c3fb27SDimitry Andric 
192306c3fb27SDimitry Andric     IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
192406c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
192506c3fb27SDimitry Andric   };
192606c3fb27SDimitry Andric 
192706c3fb27SDimitry Andric   // Whether the SU is a successor of any element in the previous SchedGroup
192806c3fb27SDimitry Andric   class IsSuccOfPrevGroup final : public InstructionRule {
192906c3fb27SDimitry Andric   public:
193006c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
193106c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
193206c3fb27SDimitry Andric       SchedGroup *OtherGroup = nullptr;
193306c3fb27SDimitry Andric       for (auto &PipeSG : SyncPipe) {
193406c3fb27SDimitry Andric         if ((unsigned)PipeSG.getSGID() == SGID - 1) {
193506c3fb27SDimitry Andric           OtherGroup = &PipeSG;
193606c3fb27SDimitry Andric         }
193706c3fb27SDimitry Andric       }
193806c3fb27SDimitry Andric 
193906c3fb27SDimitry Andric       if (!OtherGroup)
194006c3fb27SDimitry Andric         return false;
194106c3fb27SDimitry Andric       if (!OtherGroup->Collection.size())
194206c3fb27SDimitry Andric         return true;
194306c3fb27SDimitry Andric 
194406c3fb27SDimitry Andric       // Does the previous VALU have this DS_WRITE as a successor?
194506c3fb27SDimitry Andric       return (std::any_of(OtherGroup->Collection.begin(),
194606c3fb27SDimitry Andric                           OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
194706c3fb27SDimitry Andric                             return std::any_of(Elt->Succs.begin(),
194806c3fb27SDimitry Andric                                                Elt->Succs.end(),
194906c3fb27SDimitry Andric                                                [&SU](SDep &Succ) {
195006c3fb27SDimitry Andric                                                  return Succ.getSUnit() == SU;
195106c3fb27SDimitry Andric                                                });
195206c3fb27SDimitry Andric                           }));
195306c3fb27SDimitry Andric     }
195406c3fb27SDimitry Andric     IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
195506c3fb27SDimitry Andric                       bool NeedsCache = false)
195606c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
195706c3fb27SDimitry Andric   };
195806c3fb27SDimitry Andric 
195906c3fb27SDimitry Andric   // Whether the combined load width of the group stays within 128 bits
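  // Illustrative sizing (derived from the check below; the example widths are
  // hypothetical): four b32 loads (4 x 32 = 128 bits) can share a group, while
  // adding a b128 load to a group that already holds a b32 load would push the
  // running total past 128 bits and is rejected.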
196006c3fb27SDimitry Andric   class VMEMSize final : public InstructionRule {
196106c3fb27SDimitry Andric   public:
196206c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
196306c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
196406c3fb27SDimitry Andric       auto MI = SU->getInstr();
196506c3fb27SDimitry Andric       if (MI->getOpcode() == TargetOpcode::BUNDLE)
196606c3fb27SDimitry Andric         return false;
196706c3fb27SDimitry Andric       if (!Collection.size())
196806c3fb27SDimitry Andric         return true;
196906c3fb27SDimitry Andric 
197006c3fb27SDimitry Andric       int NumBits = 0;
197106c3fb27SDimitry Andric 
197206c3fb27SDimitry Andric       auto TRI = TII->getRegisterInfo();
197306c3fb27SDimitry Andric       auto &MRI = MI->getParent()->getParent()->getRegInfo();
197406c3fb27SDimitry Andric       for (auto &Elt : Collection) {
197506c3fb27SDimitry Andric         auto Op = Elt->getInstr()->getOperand(0);
197606c3fb27SDimitry Andric         auto Size =
197706c3fb27SDimitry Andric             TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
197806c3fb27SDimitry Andric         NumBits += Size;
197906c3fb27SDimitry Andric       }
198006c3fb27SDimitry Andric 
198106c3fb27SDimitry Andric       if (NumBits < 128) {
198206c3fb27SDimitry Andric         assert(TII->isVMEM(*MI) && MI->mayLoad());
198306c3fb27SDimitry Andric         if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
198406c3fb27SDimitry Andric                           MRI, MI->getOperand(0))) <=
198506c3fb27SDimitry Andric             128)
198606c3fb27SDimitry Andric           return true;
198706c3fb27SDimitry Andric       }
198806c3fb27SDimitry Andric 
198906c3fb27SDimitry Andric       return false;
199006c3fb27SDimitry Andric     }
199106c3fb27SDimitry Andric 
199206c3fb27SDimitry Andric     VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
199306c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
199406c3fb27SDimitry Andric   };
199506c3fb27SDimitry Andric 
19965f757f3fSDimitry Andric   /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
19975f757f3fSDimitry Andric   /// that is \p Distance steps away
199806c3fb27SDimitry Andric   class SharesPredWithPrevNthGroup final : public InstructionRule {
199906c3fb27SDimitry Andric   private:
200006c3fb27SDimitry Andric     unsigned Distance = 1;
200106c3fb27SDimitry Andric 
200206c3fb27SDimitry Andric   public:
200306c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
200406c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
200506c3fb27SDimitry Andric       SchedGroup *OtherGroup = nullptr;
200606c3fb27SDimitry Andric       if (!SyncPipe.size())
200706c3fb27SDimitry Andric         return false;
200806c3fb27SDimitry Andric 
200906c3fb27SDimitry Andric       if (!Cache->size()) {
201006c3fb27SDimitry Andric 
201106c3fb27SDimitry Andric         for (auto &PipeSG : SyncPipe) {
201206c3fb27SDimitry Andric           if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
201306c3fb27SDimitry Andric             OtherGroup = &PipeSG;
201406c3fb27SDimitry Andric           }
201506c3fb27SDimitry Andric         }
201606c3fb27SDimitry Andric 
201706c3fb27SDimitry Andric         if (!OtherGroup)
201806c3fb27SDimitry Andric           return false;
201906c3fb27SDimitry Andric         if (!OtherGroup->Collection.size())
202006c3fb27SDimitry Andric           return true;
202106c3fb27SDimitry Andric 
202206c3fb27SDimitry Andric         for (auto &OtherEle : OtherGroup->Collection) {
202306c3fb27SDimitry Andric           for (auto &Pred : OtherEle->Preds) {
202406c3fb27SDimitry Andric             if (Pred.getSUnit()->getInstr()->getOpcode() ==
202506c3fb27SDimitry Andric                 AMDGPU::V_PERM_B32_e64)
202606c3fb27SDimitry Andric               Cache->push_back(Pred.getSUnit());
202706c3fb27SDimitry Andric           }
202806c3fb27SDimitry Andric         }
20295f757f3fSDimitry Andric 
20305f757f3fSDimitry Andric         // If the other group has no PERM preds, then this group won't share any
20315f757f3fSDimitry Andric         if (!Cache->size())
20325f757f3fSDimitry Andric           return false;
203306c3fb27SDimitry Andric       }
203406c3fb27SDimitry Andric 
203506c3fb27SDimitry Andric       auto DAG = SyncPipe[0].DAG;
203606c3fb27SDimitry Andric       // Does the previous DS_WRITE share a V_PERM predecessor with this
203706c3fb27SDimitry Andric       // VMEM_READ
20385f757f3fSDimitry Andric       return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
203906c3fb27SDimitry Andric         return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
20405f757f3fSDimitry Andric       });
204106c3fb27SDimitry Andric     }
204206c3fb27SDimitry Andric     SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
204306c3fb27SDimitry Andric                                unsigned SGID, bool NeedsCache = false)
204406c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
204506c3fb27SDimitry Andric   };
204606c3fb27SDimitry Andric 
204706c3fb27SDimitry Andric public:
2048*0fca6ea1SDimitry Andric   bool applyIGLPStrategy(
204906c3fb27SDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
20505f757f3fSDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2051*0fca6ea1SDimitry Andric       AMDGPU::SchedulingPhase Phase) override;
205206c3fb27SDimitry Andric 
2053*0fca6ea1SDimitry Andric   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
2054*0fca6ea1SDimitry Andric                            AMDGPU::SchedulingPhase Phase) override {
2055*0fca6ea1SDimitry Andric     return true;
2056*0fca6ea1SDimitry Andric   }
205706c3fb27SDimitry Andric 
205806c3fb27SDimitry Andric   MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
205906c3fb27SDimitry Andric       : IGLPStrategy(DAG, TII) {
2060*0fca6ea1SDimitry Andric     IsBottomUp = false;
206106c3fb27SDimitry Andric   }
206206c3fb27SDimitry Andric };
206306c3fb27SDimitry Andric 
20645f757f3fSDimitry Andric static unsigned DSWCount = 0;
20655f757f3fSDimitry Andric static unsigned DSWWithPermCount = 0;
20665f757f3fSDimitry Andric static unsigned DSWWithSharedVMEMCount = 0;
20675f757f3fSDimitry Andric 
2068*0fca6ea1SDimitry Andric bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
206906c3fb27SDimitry Andric     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
20705f757f3fSDimitry Andric     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2071*0fca6ea1SDimitry Andric     AMDGPU::SchedulingPhase Phase) {
207206c3fb27SDimitry Andric   unsigned MFMACount = 0;
207306c3fb27SDimitry Andric   unsigned DSRCount = 0;
20745f757f3fSDimitry Andric 
2075*0fca6ea1SDimitry Andric   bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
2076*0fca6ea1SDimitry Andric 
2077*0fca6ea1SDimitry Andric   assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
20785f757f3fSDimitry Andric                          DSWWithSharedVMEMCount == 0)) &&
20795f757f3fSDimitry Andric          "DSWCounters should be zero in pre-RA scheduling!");
208006c3fb27SDimitry Andric   SmallVector<SUnit *, 6> DSWithPerms;
208106c3fb27SDimitry Andric   for (auto &SU : DAG->SUnits) {
208206c3fb27SDimitry Andric     auto I = SU.getInstr();
208306c3fb27SDimitry Andric     if (TII->isMFMAorWMMA(*I))
208406c3fb27SDimitry Andric       ++MFMACount;
208506c3fb27SDimitry Andric     else if (TII->isDS(*I)) {
208606c3fb27SDimitry Andric       if (I->mayLoad())
208706c3fb27SDimitry Andric         ++DSRCount;
2088*0fca6ea1SDimitry Andric       else if (I->mayStore() && IsInitial) {
208906c3fb27SDimitry Andric         ++DSWCount;
209006c3fb27SDimitry Andric         for (auto Pred : SU.Preds) {
209106c3fb27SDimitry Andric           if (Pred.getSUnit()->getInstr()->getOpcode() ==
209206c3fb27SDimitry Andric               AMDGPU::V_PERM_B32_e64) {
209306c3fb27SDimitry Andric             DSWithPerms.push_back(&SU);
209406c3fb27SDimitry Andric             break;
209506c3fb27SDimitry Andric           }
209606c3fb27SDimitry Andric         }
209706c3fb27SDimitry Andric       }
209806c3fb27SDimitry Andric     }
209906c3fb27SDimitry Andric   }
21005f757f3fSDimitry Andric 
2101*0fca6ea1SDimitry Andric   if (IsInitial) {
210206c3fb27SDimitry Andric     DSWWithPermCount = DSWithPerms.size();
210306c3fb27SDimitry Andric     auto I = DSWithPerms.begin();
210406c3fb27SDimitry Andric     auto E = DSWithPerms.end();
210506c3fb27SDimitry Andric 
210606c3fb27SDimitry Andric     // Get the count of DS_WRITES with V_PERM predecessors which
210706c3fb27SDimitry Andric     // have loop carried dependencies (WAR) on the same VMEM_READs.
210806c3fb27SDimitry Andric     // We consider partial overlap as a miss -- in other words,
210906c3fb27SDimitry Andric     // for a given DS_W, we only consider another DS_W as matching
211006c3fb27SDimitry Andric     // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
211106c3fb27SDimitry Andric     // for every V_PERM pred of this DS_W.
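    // Illustrative example with hypothetical nodes: DS_W0 is visited first and
    // seeds VMEMLookup with its V_PERM-fed reads {R0, R1}; DS_W1's V_PERM
    // preds feed the same {R0, R1}, so the pair is counted
    // (DSWWithSharedVMEMCount += 2). Had one of DS_W1's V_PERMs instead fed a
    // read outside that set, the miss would leave DS_W1 unpaired in this pass.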
211206c3fb27SDimitry Andric     DenseMap<MachineInstr *, SUnit *> VMEMLookup;
211306c3fb27SDimitry Andric     SmallVector<SUnit *, 6> Counted;
211406c3fb27SDimitry Andric     for (; I != E; I++) {
211506c3fb27SDimitry Andric       SUnit *Cand = nullptr;
211606c3fb27SDimitry Andric       bool MissedAny = false;
211706c3fb27SDimitry Andric       for (auto &Pred : (*I)->Preds) {
211806c3fb27SDimitry Andric         if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
211906c3fb27SDimitry Andric           continue;
212006c3fb27SDimitry Andric 
21215f757f3fSDimitry Andric         if (Cand && llvm::is_contained(Counted, Cand))
212206c3fb27SDimitry Andric           break;
212306c3fb27SDimitry Andric 
212406c3fb27SDimitry Andric         for (auto &Succ : Pred.getSUnit()->Succs) {
212506c3fb27SDimitry Andric           auto MI = Succ.getSUnit()->getInstr();
212606c3fb27SDimitry Andric           if (!TII->isVMEM(*MI) || !MI->mayLoad())
212706c3fb27SDimitry Andric             continue;
212806c3fb27SDimitry Andric 
212906c3fb27SDimitry Andric           if (MissedAny || !VMEMLookup.size()) {
213006c3fb27SDimitry Andric             MissedAny = true;
213106c3fb27SDimitry Andric             VMEMLookup[MI] = *I;
213206c3fb27SDimitry Andric             continue;
213306c3fb27SDimitry Andric           }
213406c3fb27SDimitry Andric 
213506c3fb27SDimitry Andric           if (!VMEMLookup.contains(MI)) {
213606c3fb27SDimitry Andric             MissedAny = true;
213706c3fb27SDimitry Andric             VMEMLookup[MI] = *I;
213806c3fb27SDimitry Andric             continue;
213906c3fb27SDimitry Andric           }
214006c3fb27SDimitry Andric 
214106c3fb27SDimitry Andric           Cand = VMEMLookup[MI];
21425f757f3fSDimitry Andric           if (llvm::is_contained(Counted, Cand)) {
214306c3fb27SDimitry Andric             MissedAny = true;
214406c3fb27SDimitry Andric             break;
214506c3fb27SDimitry Andric           }
214606c3fb27SDimitry Andric         }
214706c3fb27SDimitry Andric       }
214806c3fb27SDimitry Andric       if (!MissedAny && Cand) {
214906c3fb27SDimitry Andric         DSWWithSharedVMEMCount += 2;
215006c3fb27SDimitry Andric         Counted.push_back(Cand);
215106c3fb27SDimitry Andric         Counted.push_back(*I);
215206c3fb27SDimitry Andric       }
215306c3fb27SDimitry Andric     }
21545f757f3fSDimitry Andric   }
215506c3fb27SDimitry Andric 
215606c3fb27SDimitry Andric   assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
215706c3fb27SDimitry Andric   SchedGroup *SG;
215806c3fb27SDimitry Andric   unsigned PipelineSyncID = 0;
215906c3fb27SDimitry Andric   // For kernels with V_PERM, there are enough VALUs to mix in between MFMAs
216006c3fb27SDimitry Andric   if (DSWWithPermCount) {
216106c3fb27SDimitry Andric     for (unsigned I = 0; I < MFMACount; I++) {
216206c3fb27SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
216306c3fb27SDimitry Andric           SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
216406c3fb27SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
216506c3fb27SDimitry Andric 
216606c3fb27SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
216706c3fb27SDimitry Andric           SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
216806c3fb27SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
216906c3fb27SDimitry Andric     }
217006c3fb27SDimitry Andric   }
217106c3fb27SDimitry Andric 
217206c3fb27SDimitry Andric   PipelineSyncID = 1;
217306c3fb27SDimitry Andric   // Phase 1: Break up DS_READ and MFMA clusters.
217406c3fb27SDimitry Andric   // First, schedule the DS_READs that make the initial MFMAs ready, then
217506c3fb27SDimitry Andric   // interleave MFMAs with DS_READ prefetches.
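  // Resulting Phase 1 pipeline (sketch derived from the groups below):
  //   DS_READ x4 (preds of the first MFMAs), MFMA x1, then (DSRCount - 4)
  //   repetitions of { DS_READ x1, MFMA x1 }.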
217606c3fb27SDimitry Andric 
217706c3fb27SDimitry Andric   // Make the initial MFMA ready
217806c3fb27SDimitry Andric   SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
217906c3fb27SDimitry Andric       SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
218006c3fb27SDimitry Andric   SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
218106c3fb27SDimitry Andric   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
218206c3fb27SDimitry Andric 
218306c3fb27SDimitry Andric   SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
218406c3fb27SDimitry Andric       SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
218506c3fb27SDimitry Andric   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
218606c3fb27SDimitry Andric 
218706c3fb27SDimitry Andric   // Interleave MFMA with DS_READ prefetch
218806c3fb27SDimitry Andric   for (unsigned I = 0; I < DSRCount - 4; ++I) {
218906c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
219006c3fb27SDimitry Andric         SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
219106c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
219206c3fb27SDimitry Andric 
219306c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
219406c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
219506c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
219606c3fb27SDimitry Andric   }
219706c3fb27SDimitry Andric 
219806c3fb27SDimitry Andric   // Phase 2a: Loop-carried dependency with V_PERM
219906c3fb27SDimitry Andric   // Schedule V_PERM & DS_WRITE as closely as possible to the VMEM_READs they
220006c3fb27SDimitry Andric   // depend on. Interleave MFMAs to keep the XDL unit busy throughout.
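  // Sketch of one Phase 2a iteration (derived from the groups below):
  //   VALU x4 (V_PERMs feeding a common DS_WRITE), DS_WRITE x1 (succ of that
  //   group), VMEM_READ x4 (sharing a V_PERM pred with the DS_WRITE one group
  //   back), MFMA x1, VMEM_READ x4 (sharing a V_PERM pred with the DS_WRITE
  //   three groups back), MFMA x1.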
220106c3fb27SDimitry Andric   for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
220206c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
220306c3fb27SDimitry Andric         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
220406c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
220506c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
220606c3fb27SDimitry Andric 
220706c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
220806c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2209*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
221006c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
221106c3fb27SDimitry Andric 
221206c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
221306c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
221406c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
221506c3fb27SDimitry Andric         1, TII, SG->getSGID(), true));
2216*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
221706c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
221806c3fb27SDimitry Andric 
221906c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
222006c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
222106c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
222206c3fb27SDimitry Andric 
222306c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
222406c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
222506c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
222606c3fb27SDimitry Andric         3, TII, SG->getSGID(), true));
2227*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
222806c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
222906c3fb27SDimitry Andric 
223006c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
223106c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
223206c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
223306c3fb27SDimitry Andric   }
223406c3fb27SDimitry Andric 
223506c3fb27SDimitry Andric   // Phase 2b: Loop-carried dependency without V_PERM
223606c3fb27SDimitry Andric   // Schedule DS_WRITEs as closely as possible to the VMEM_READs they depend on.
223706c3fb27SDimitry Andric   // Interleave MFMAs to keep the XDL unit busy throughout.
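  // Sketch of one Phase 2b iteration (derived from the groups below):
  //   DS_WRITE x1, VMEM_READ x4 (width-limited by the VMEMSize rule), MFMA x1.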
223806c3fb27SDimitry Andric   for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
223906c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
224006c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
224106c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
224206c3fb27SDimitry Andric 
224306c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
224406c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2245*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
224606c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
224706c3fb27SDimitry Andric 
224806c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
224906c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
225006c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
225106c3fb27SDimitry Andric   }
225206c3fb27SDimitry Andric 
225306c3fb27SDimitry Andric   // Phase 2c: Loop-carried dependency with V_PERM, where the VMEM_READs are
225406c3fb27SDimitry Andric   // ultimately used by two DS_WRITEs.
225506c3fb27SDimitry Andric   // Schedule V_PERM & DS_WRITE as closely as possible to the VMEM_READs they
225606c3fb27SDimitry Andric   // depend on. Interleave MFMAs to keep the XDL unit busy throughout.
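  // Sketch of one Phase 2c iteration (derived from the groups below): two
  //   { VALU x4 (V_PERMs), DS_WRITE x1, MFMA x1 } sequences, followed by
  //   { VMEM_READ x4 sharing a V_PERM pred with the DS_WRITE two groups back,
  //     MFMA x1 } and { VMEM_READ x4 sharing a V_PERM pred with the DS_WRITE
  //     four groups back, MFMA x1 }.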
225706c3fb27SDimitry Andric 
225806c3fb27SDimitry Andric   for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
225906c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
226006c3fb27SDimitry Andric         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
226106c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
226206c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
226306c3fb27SDimitry Andric 
226406c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
226506c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2266*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
226706c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
226806c3fb27SDimitry Andric 
226906c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
227006c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
227106c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
227206c3fb27SDimitry Andric 
227306c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
227406c3fb27SDimitry Andric         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
227506c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
227606c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
227706c3fb27SDimitry Andric 
227806c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
227906c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2280*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
228106c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
228206c3fb27SDimitry Andric 
228306c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
228406c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
228506c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
228606c3fb27SDimitry Andric 
228706c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
228806c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
228906c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
229006c3fb27SDimitry Andric         2, TII, SG->getSGID(), true));
2291*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
229206c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
229306c3fb27SDimitry Andric 
229406c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
229506c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
229606c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
229706c3fb27SDimitry Andric 
229806c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
229906c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
230006c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
230106c3fb27SDimitry Andric         4, TII, SG->getSGID(), true));
2302*0fca6ea1SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
230306c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
230406c3fb27SDimitry Andric 
230506c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
230606c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
230706c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
230806c3fb27SDimitry Andric   }
2309*0fca6ea1SDimitry Andric 
2310*0fca6ea1SDimitry Andric   return true;
231106c3fb27SDimitry Andric }
231206c3fb27SDimitry Andric 
2313bdd1243dSDimitry Andric static std::unique_ptr<IGLPStrategy>
2314bdd1243dSDimitry Andric createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
2315bdd1243dSDimitry Andric                    const SIInstrInfo *TII) {
2316bdd1243dSDimitry Andric   switch (ID) {
2317bdd1243dSDimitry Andric   case MFMASmallGemmOptID:
2318bdd1243dSDimitry Andric     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
231906c3fb27SDimitry Andric   case MFMASmallGemmSingleWaveOptID:
232006c3fb27SDimitry Andric     return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
2321*0fca6ea1SDimitry Andric   case MFMAExpInterleave:
2322*0fca6ea1SDimitry Andric     return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
2323bdd1243dSDimitry Andric   }
2324bdd1243dSDimitry Andric 
2325bdd1243dSDimitry Andric   llvm_unreachable("Unknown IGLPStrategyID");
2326bdd1243dSDimitry Andric }
2327bdd1243dSDimitry Andric 
2328bdd1243dSDimitry Andric class IGroupLPDAGMutation : public ScheduleDAGMutation {
2329bdd1243dSDimitry Andric private:
2330bdd1243dSDimitry Andric   const SIInstrInfo *TII;
2331bdd1243dSDimitry Andric 
2332bdd1243dSDimitry Andric   ScheduleDAGMI *DAG;
2333bdd1243dSDimitry Andric 
2334bdd1243dSDimitry Andric   // Organize lists of SchedGroups by their SyncID. SchedGroups /
2335bdd1243dSDimitry Andric   // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
2336bdd1243dSDimitry Andric   // between then.
2337bdd1243dSDimitry Andric   // between them.
2338bdd1243dSDimitry Andric 
2339bdd1243dSDimitry Andric   // Used to track instructions that can be mapped to multiple sched groups
2340bdd1243dSDimitry Andric   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
2341bdd1243dSDimitry Andric 
2342bdd1243dSDimitry Andric   // Add DAG edges that enforce SCHED_BARRIER ordering.
2343bdd1243dSDimitry Andric   void addSchedBarrierEdges(SUnit &SU);
2344bdd1243dSDimitry Andric 
2345bdd1243dSDimitry Andric   // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
2346bdd1243dSDimitry Andric   // not be reordered across the SCHED_BARRIER. This is used for the base
2347bdd1243dSDimitry Andric   // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
2348bdd1243dSDimitry Andric   // SCHED_BARRIER will always block all instructions that can be classified
2349bdd1243dSDimitry Andric   // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
2350bdd1243dSDimitry Andric   // and may only synchronize with some SchedGroups. Returns the inverse of
2351bdd1243dSDimitry Andric   // Mask. SCHED_BARRIER's mask describes which instruction types should be
2352bdd1243dSDimitry Andric   // allowed to be scheduled across it. Invert the mask to get the
2353bdd1243dSDimitry Andric   // SchedGroupMask of instructions that should be barred.
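  // Worked example (derived from the implementation below; concrete mask
  // values omitted): if the SCHED_BARRIER mask allows only VALU instructions
  // to be scheduled across it, the inverted mask initially bars every other
  // type; because VALU is allowed, the broader ALU bit is also cleared so that
  // VALU instructions are not re-blocked through the generic ALU
  // classification.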
2354bdd1243dSDimitry Andric   SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
2355bdd1243dSDimitry Andric 
2356bdd1243dSDimitry Andric   // Create SchedGroups for a SCHED_GROUP_BARRIER.
2357bdd1243dSDimitry Andric   void initSchedGroupBarrierPipelineStage(
2358bdd1243dSDimitry Andric       std::vector<SUnit>::reverse_iterator RIter);
2359bdd1243dSDimitry Andric 
2360*0fca6ea1SDimitry Andric   bool initIGLPOpt(SUnit &SU);
2361bdd1243dSDimitry Andric 
2362bdd1243dSDimitry Andric public:
2363bdd1243dSDimitry Andric   void apply(ScheduleDAGInstrs *DAGInstrs) override;
2364bdd1243dSDimitry Andric 
236506c3fb27SDimitry Andric   // The order in which the PipelineSolver should process the candidate
236606c3fb27SDimitry Andric   // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
236706c3fb27SDimitry Andric   // created SchedGroup first, and will consider that as the ultimate
236806c3fb27SDimitry Andric   // predecessor group when linking. TOP_DOWN instead links and processes the
236906c3fb27SDimitry Andric   // first created SchedGroup first.
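  // For example (sketch): given groups G0, G1, G2 created in that order for
  // one pipeline, BOTTOM_UP fills and links starting from G2, treating it as
  // the ultimate predecessor group, while TOP_DOWN starts from G0.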
2370*0fca6ea1SDimitry Andric   bool IsBottomUp = true;
237106c3fb27SDimitry Andric 
2372*0fca6ea1SDimitry Andric   // The scheduling phase this application of IGLP corresponds with.
2373*0fca6ea1SDimitry Andric   AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
23745f757f3fSDimitry Andric 
2375bdd1243dSDimitry Andric   IGroupLPDAGMutation() = default;
2376*0fca6ea1SDimitry Andric   IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
2377bdd1243dSDimitry Andric };
2378bdd1243dSDimitry Andric 
2379bdd1243dSDimitry Andric unsigned SchedGroup::NumSchedGroups = 0;
2380bdd1243dSDimitry Andric 
2381bdd1243dSDimitry Andric bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
2382bdd1243dSDimitry Andric   if (A != B && DAG->canAddEdge(B, A)) {
2383bdd1243dSDimitry Andric     DAG->addEdge(B, SDep(A, SDep::Artificial));
2384bdd1243dSDimitry Andric     return true;
2385bdd1243dSDimitry Andric   }
2386bdd1243dSDimitry Andric   return false;
2387bdd1243dSDimitry Andric }
2388bdd1243dSDimitry Andric 
2389bdd1243dSDimitry Andric bool SchedGroup::canAddMI(const MachineInstr &MI) const {
2390bdd1243dSDimitry Andric   bool Result = false;
2391bdd1243dSDimitry Andric   if (MI.isMetaInstruction())
2392bdd1243dSDimitry Andric     Result = false;
2393bdd1243dSDimitry Andric 
2394bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2395cb14a3feSDimitry Andric            (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
2396cb14a3feSDimitry Andric             TII->isTRANS(MI)))
2397bdd1243dSDimitry Andric     Result = true;
2398bdd1243dSDimitry Andric 
2399bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2400cb14a3feSDimitry Andric            TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
2401bdd1243dSDimitry Andric     Result = true;
2402bdd1243dSDimitry Andric 
2403bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2404bdd1243dSDimitry Andric            TII->isSALU(MI))
2405bdd1243dSDimitry Andric     Result = true;
2406bdd1243dSDimitry Andric 
2407bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2408bdd1243dSDimitry Andric            TII->isMFMAorWMMA(MI))
2409bdd1243dSDimitry Andric     Result = true;
2410bdd1243dSDimitry Andric 
2411bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2412bdd1243dSDimitry Andric            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
2413bdd1243dSDimitry Andric     Result = true;
2414bdd1243dSDimitry Andric 
2415bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2416bdd1243dSDimitry Andric            MI.mayLoad() &&
2417bdd1243dSDimitry Andric            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
2418bdd1243dSDimitry Andric     Result = true;
2419bdd1243dSDimitry Andric 
2420bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2421bdd1243dSDimitry Andric            MI.mayStore() &&
2422bdd1243dSDimitry Andric            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
2423bdd1243dSDimitry Andric     Result = true;
2424bdd1243dSDimitry Andric 
2425bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2426bdd1243dSDimitry Andric            TII->isDS(MI))
2427bdd1243dSDimitry Andric     Result = true;
2428bdd1243dSDimitry Andric 
2429bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2430bdd1243dSDimitry Andric            MI.mayLoad() && TII->isDS(MI))
2431bdd1243dSDimitry Andric     Result = true;
2432bdd1243dSDimitry Andric 
2433bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2434bdd1243dSDimitry Andric            MI.mayStore() && TII->isDS(MI))
2435bdd1243dSDimitry Andric     Result = true;
2436bdd1243dSDimitry Andric 
2437cb14a3feSDimitry Andric   else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2438cb14a3feSDimitry Andric            TII->isTRANS(MI))
2439cb14a3feSDimitry Andric     Result = true;
2440cb14a3feSDimitry Andric 
2441bdd1243dSDimitry Andric   LLVM_DEBUG(
2442bdd1243dSDimitry Andric       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
2443bdd1243dSDimitry Andric              << (Result ? " could classify " : " unable to classify ") << MI);
2444bdd1243dSDimitry Andric 
2445bdd1243dSDimitry Andric   return Result;
2446bdd1243dSDimitry Andric }
2447bdd1243dSDimitry Andric 
2448bdd1243dSDimitry Andric int SchedGroup::link(SUnit &SU, bool MakePred,
2449bdd1243dSDimitry Andric                      std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2450bdd1243dSDimitry Andric   int MissedEdges = 0;
2451bdd1243dSDimitry Andric   for (auto *A : Collection) {
2452bdd1243dSDimitry Andric     SUnit *B = &SU;
2453bdd1243dSDimitry Andric     if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2454bdd1243dSDimitry Andric       continue;
2455bdd1243dSDimitry Andric     if (MakePred)
2456bdd1243dSDimitry Andric       std::swap(A, B);
2457bdd1243dSDimitry Andric 
2458bdd1243dSDimitry Andric     if (DAG->IsReachable(B, A))
2459bdd1243dSDimitry Andric       continue;
246006c3fb27SDimitry Andric 
2461bdd1243dSDimitry Andric     // tryAddEdge returns false if there is a dependency that makes adding
2462bdd1243dSDimitry Andric     // the A->B edge impossible; otherwise it returns true.
2463bdd1243dSDimitry Andric     bool Added = tryAddEdge(A, B);
2464bdd1243dSDimitry Andric     if (Added)
2465*0fca6ea1SDimitry Andric       AddedEdges.emplace_back(A, B);
2466bdd1243dSDimitry Andric     else
2467bdd1243dSDimitry Andric       ++MissedEdges;
2468bdd1243dSDimitry Andric   }
2469bdd1243dSDimitry Andric 
2470bdd1243dSDimitry Andric   return MissedEdges;
2471bdd1243dSDimitry Andric }
2472bdd1243dSDimitry Andric 
2473bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU, bool MakePred) {
2474bdd1243dSDimitry Andric   for (auto *A : Collection) {
2475bdd1243dSDimitry Andric     SUnit *B = &SU;
2476bdd1243dSDimitry Andric     if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2477bdd1243dSDimitry Andric       continue;
2478bdd1243dSDimitry Andric     if (MakePred)
2479bdd1243dSDimitry Andric       std::swap(A, B);
2480bdd1243dSDimitry Andric 
2481bdd1243dSDimitry Andric     tryAddEdge(A, B);
2482bdd1243dSDimitry Andric   }
2483bdd1243dSDimitry Andric }
2484bdd1243dSDimitry Andric 
2485bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU,
2486bdd1243dSDimitry Andric                       function_ref<bool(const SUnit *A, const SUnit *B)> P) {
2487bdd1243dSDimitry Andric   for (auto *A : Collection) {
2488bdd1243dSDimitry Andric     SUnit *B = &SU;
2489bdd1243dSDimitry Andric     if (P(A, B))
2490bdd1243dSDimitry Andric       std::swap(A, B);
2491bdd1243dSDimitry Andric 
2492bdd1243dSDimitry Andric     tryAddEdge(A, B);
2493bdd1243dSDimitry Andric   }
2494bdd1243dSDimitry Andric }
2495bdd1243dSDimitry Andric 
2496bdd1243dSDimitry Andric void SchedGroup::link(SchedGroup &OtherGroup) {
2497bdd1243dSDimitry Andric   for (auto *B : OtherGroup.Collection)
2498bdd1243dSDimitry Andric     link(*B);
2499bdd1243dSDimitry Andric }
2500bdd1243dSDimitry Andric 
2501bdd1243dSDimitry Andric bool SchedGroup::canAddSU(SUnit &SU) const {
2502bdd1243dSDimitry Andric   MachineInstr &MI = *SU.getInstr();
2503bdd1243dSDimitry Andric   if (MI.getOpcode() != TargetOpcode::BUNDLE)
2504bdd1243dSDimitry Andric     return canAddMI(MI);
2505bdd1243dSDimitry Andric 
2506bdd1243dSDimitry Andric   // Special case for bundled MIs.
2507bdd1243dSDimitry Andric   const MachineBasicBlock *MBB = MI.getParent();
2508bdd1243dSDimitry Andric   MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
2509bdd1243dSDimitry Andric   while (E != MBB->end() && E->isBundledWithPred())
2510bdd1243dSDimitry Andric     ++E;
2511bdd1243dSDimitry Andric 
2512bdd1243dSDimitry Andric   // Return true if all of the bundled MIs can be added to this group.
2513bdd1243dSDimitry Andric   return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
2514bdd1243dSDimitry Andric }
2515bdd1243dSDimitry Andric 
2516bdd1243dSDimitry Andric void SchedGroup::initSchedGroup() {
2517bdd1243dSDimitry Andric   for (auto &SU : DAG->SUnits) {
2518bdd1243dSDimitry Andric     if (isFull())
2519bdd1243dSDimitry Andric       break;
2520bdd1243dSDimitry Andric 
2521bdd1243dSDimitry Andric     if (canAddSU(SU))
2522bdd1243dSDimitry Andric       add(SU);
2523bdd1243dSDimitry Andric   }
2524bdd1243dSDimitry Andric }
2525bdd1243dSDimitry Andric 
2526bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
2527bdd1243dSDimitry Andric                                 SUnitsToCandidateSGsMap &SyncedInstrs) {
2528bdd1243dSDimitry Andric   SUnit &InitSU = *RIter;
2529bdd1243dSDimitry Andric   for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
2530bdd1243dSDimitry Andric     auto &SU = *RIter;
2531bdd1243dSDimitry Andric     if (isFull())
2532bdd1243dSDimitry Andric       break;
2533bdd1243dSDimitry Andric 
2534bdd1243dSDimitry Andric     if (canAddSU(SU))
2535bdd1243dSDimitry Andric       SyncedInstrs[&SU].push_back(SGID);
2536bdd1243dSDimitry Andric   }
2537bdd1243dSDimitry Andric 
2538bdd1243dSDimitry Andric   add(InitSU);
2539bdd1243dSDimitry Andric   assert(MaxSize);
2540bdd1243dSDimitry Andric   (*MaxSize)++;
2541bdd1243dSDimitry Andric }
2542bdd1243dSDimitry Andric 
2543bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
2544bdd1243dSDimitry Andric   auto I = DAG->SUnits.rbegin();
2545bdd1243dSDimitry Andric   auto E = DAG->SUnits.rend();
2546bdd1243dSDimitry Andric   for (; I != E; ++I) {
2547bdd1243dSDimitry Andric     auto &SU = *I;
2548bdd1243dSDimitry Andric     if (isFull())
2549bdd1243dSDimitry Andric       break;
2550bdd1243dSDimitry Andric     if (canAddSU(SU))
2551bdd1243dSDimitry Andric       SyncedInstrs[&SU].push_back(SGID);
2552bdd1243dSDimitry Andric   }
2553bdd1243dSDimitry Andric }
2554bdd1243dSDimitry Andric 
2555bdd1243dSDimitry Andric void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
255681ad6265SDimitry Andric   const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
255781ad6265SDimitry Andric   if (!TSchedModel || DAGInstrs->SUnits.empty())
255881ad6265SDimitry Andric     return;
255981ad6265SDimitry Andric 
2560bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
256181ad6265SDimitry Andric   const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
256281ad6265SDimitry Andric   TII = ST.getInstrInfo();
256381ad6265SDimitry Andric   DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
2564bdd1243dSDimitry Andric   SyncedSchedGroups.clear();
2565bdd1243dSDimitry Andric   SyncedInstrs.clear();
2566*0fca6ea1SDimitry Andric   bool FoundSB = false;
2567*0fca6ea1SDimitry Andric   bool FoundIGLP = false;
2568*0fca6ea1SDimitry Andric   bool ShouldApplyIGLP = false;
2569bdd1243dSDimitry Andric   for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
2570bdd1243dSDimitry Andric     unsigned Opc = R->getInstr()->getOpcode();
2571bdd1243dSDimitry Andric     // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
2572bdd1243dSDimitry Andric     if (Opc == AMDGPU::SCHED_BARRIER) {
2573bdd1243dSDimitry Andric       addSchedBarrierEdges(*R);
2574*0fca6ea1SDimitry Andric       FoundSB = true;
2575bdd1243dSDimitry Andric     } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2576bdd1243dSDimitry Andric       initSchedGroupBarrierPipelineStage(R);
2577*0fca6ea1SDimitry Andric       FoundSB = true;
2578bdd1243dSDimitry Andric     } else if (Opc == AMDGPU::IGLP_OPT) {
2579bdd1243dSDimitry Andric       resetEdges(*R, DAG);
2580*0fca6ea1SDimitry Andric       if (!FoundSB && !FoundIGLP) {
2581*0fca6ea1SDimitry Andric         FoundIGLP = true;
2582*0fca6ea1SDimitry Andric         ShouldApplyIGLP = initIGLPOpt(*R);
2583*0fca6ea1SDimitry Andric       }
2584bdd1243dSDimitry Andric     }
258581ad6265SDimitry Andric   }
258681ad6265SDimitry Andric 
2587*0fca6ea1SDimitry Andric   if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
258806c3fb27SDimitry Andric     PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2589bdd1243dSDimitry Andric     // PipelineSolver performs the mutation by adding the edges it
2590bdd1243dSDimitry Andric     // determined to be the best.
2591bdd1243dSDimitry Andric     PS.solve();
2592*0fca6ea1SDimitry Andric     return;
2593bdd1243dSDimitry Andric   }
2594bdd1243dSDimitry Andric }
2595bdd1243dSDimitry Andric 
2596bdd1243dSDimitry Andric void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
259781ad6265SDimitry Andric   MachineInstr &MI = *SchedBarrier.getInstr();
259881ad6265SDimitry Andric   assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
259981ad6265SDimitry Andric   // Remove all existing edges from the SCHED_BARRIER that were added due to the
260081ad6265SDimitry Andric   // instruction having side effects.
2601bdd1243dSDimitry Andric   resetEdges(SchedBarrier, DAG);
2602cb14a3feSDimitry Andric   LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
2603cb14a3feSDimitry Andric                     << MI.getOperand(0).getImm() << "\n");
2604bdd1243dSDimitry Andric   auto InvertedMask =
2605bdd1243dSDimitry Andric       invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
2606bdd1243dSDimitry Andric   SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
2607bdd1243dSDimitry Andric   SG.initSchedGroup();
2608cb14a3feSDimitry Andric 
2609bdd1243dSDimitry Andric   // Preserve original instruction ordering relative to the SCHED_BARRIER.
2610bdd1243dSDimitry Andric   SG.link(
2611bdd1243dSDimitry Andric       SchedBarrier,
2612bdd1243dSDimitry Andric       (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
2613bdd1243dSDimitry Andric           const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
261481ad6265SDimitry Andric }
261581ad6265SDimitry Andric 
2616bdd1243dSDimitry Andric SchedGroupMask
2617bdd1243dSDimitry Andric IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
2618bdd1243dSDimitry Andric   // Invert mask and erase bits for types of instructions that are implied to be
2619bdd1243dSDimitry Andric   // allowed past the SCHED_BARRIER.
2620bdd1243dSDimitry Andric   SchedGroupMask InvertedMask = ~Mask;
2621bdd1243dSDimitry Andric 
2622cb14a3feSDimitry Andric   // ALU implies VALU, SALU, MFMA, TRANS.
2623bdd1243dSDimitry Andric   if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2624cb14a3feSDimitry Andric     InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
2625cb14a3feSDimitry Andric                     ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
2626cb14a3feSDimitry Andric   // VALU, SALU, MFMA, TRANS implies ALU.
2627bdd1243dSDimitry Andric   else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2628bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2629cb14a3feSDimitry Andric            (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2630cb14a3feSDimitry Andric            (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2631bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::ALU;
2632bdd1243dSDimitry Andric 
2633bdd1243dSDimitry Andric   // VMEM implies VMEM_READ, VMEM_WRITE.
2634bdd1243dSDimitry Andric   if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2635bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
2636bdd1243dSDimitry Andric   // VMEM_READ, VMEM_WRITE implies VMEM.
2637bdd1243dSDimitry Andric   else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2638bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
2639bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::VMEM;
2640bdd1243dSDimitry Andric 
2641bdd1243dSDimitry Andric   // DS implies DS_READ, DS_WRITE.
2642bdd1243dSDimitry Andric   if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2643bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
2644bdd1243dSDimitry Andric   // DS_READ, DS_WRITE implies DS.
2645bdd1243dSDimitry Andric   else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2646bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2647bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::DS;
2648bdd1243dSDimitry Andric 
2649cb14a3feSDimitry Andric   LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
2650cb14a3feSDimitry Andric                     << "\n");
2651cb14a3feSDimitry Andric 
2652bdd1243dSDimitry Andric   return InvertedMask;
265381ad6265SDimitry Andric }
265481ad6265SDimitry Andric 
2655bdd1243dSDimitry Andric void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2656bdd1243dSDimitry Andric     std::vector<SUnit>::reverse_iterator RIter) {
2657bdd1243dSDimitry Andric   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
2658bdd1243dSDimitry Andric   // to the instruction having side effects.
2659bdd1243dSDimitry Andric   resetEdges(*RIter, DAG);
2660bdd1243dSDimitry Andric   MachineInstr &SGB = *RIter->getInstr();
2661bdd1243dSDimitry Andric   assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
2662bdd1243dSDimitry Andric   int32_t SGMask = SGB.getOperand(0).getImm();
2663bdd1243dSDimitry Andric   int32_t Size = SGB.getOperand(1).getImm();
2664bdd1243dSDimitry Andric   int32_t SyncID = SGB.getOperand(2).getImm();
2665bdd1243dSDimitry Andric 
2666bdd1243dSDimitry Andric   auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2667bdd1243dSDimitry Andric                                                     Size, SyncID, DAG, TII);
2668bdd1243dSDimitry Andric 
2669bdd1243dSDimitry Andric   SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
267081ad6265SDimitry Andric }
267181ad6265SDimitry Andric 
2672*0fca6ea1SDimitry Andric bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2673bdd1243dSDimitry Andric   IGLPStrategyID StrategyID =
2674bdd1243dSDimitry Andric       (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
2675bdd1243dSDimitry Andric   auto S = createIGLPStrategy(StrategyID, DAG, TII);
2676*0fca6ea1SDimitry Andric   if (!S->shouldApplyStrategy(DAG, Phase))
2677*0fca6ea1SDimitry Andric     return false;
2678*0fca6ea1SDimitry Andric 
267906c3fb27SDimitry Andric   IsBottomUp = S->IsBottomUp;
2680*0fca6ea1SDimitry Andric   return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
268106c3fb27SDimitry Andric }
268281ad6265SDimitry Andric 
268381ad6265SDimitry Andric } // namespace
268481ad6265SDimitry Andric 
268581ad6265SDimitry Andric namespace llvm {
268681ad6265SDimitry Andric 
2687*0fca6ea1SDimitry Andric /// \p Phase specifies whether or not this is a reentry into the
26885f757f3fSDimitry Andric /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
26895f757f3fSDimitry Andric /// same scheduling region (e.g. pre and post-RA scheduling / multiple
26905f757f3fSDimitry Andric /// scheduling "phases"), we can reenter this mutation framework more than once
26915f757f3fSDimitry Andric /// for a given region.
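///
/// A typical registration looks like the following (sketch only; the exact
/// call site lives in the AMDGPU scheduler setup and may differ):
///   DAG->addMutation(
///       createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));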
2692*0fca6ea1SDimitry Andric std::unique_ptr<ScheduleDAGMutation>
2693*0fca6ea1SDimitry Andric createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
2694*0fca6ea1SDimitry Andric   return std::make_unique<IGroupLPDAGMutation>(Phase);
269581ad6265SDimitry Andric }
269681ad6265SDimitry Andric 
269781ad6265SDimitry Andric } // end namespace llvm
2698