xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (revision 5f757f3ff9144b609b3c433dfd370cc6bdc191ad)
181ad6265SDimitry Andric //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP  ------------===//
281ad6265SDimitry Andric //
381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
681ad6265SDimitry Andric //
781ad6265SDimitry Andric //===----------------------------------------------------------------------===//
881ad6265SDimitry Andric //
981ad6265SDimitry Andric // \file This file defines a set of schedule DAG mutations that can be used to
1081ad6265SDimitry Andric // override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
1381ad6265SDimitry Andric // predict the optimal instruction ordering, or in kernels where optimum
1481ad6265SDimitry Andric // instruction scheduling is important enough to warrant manual intervention.
1581ad6265SDimitry Andric //
1681ad6265SDimitry Andric //===----------------------------------------------------------------------===//
1781ad6265SDimitry Andric 
1881ad6265SDimitry Andric #include "AMDGPUIGroupLP.h"
1981ad6265SDimitry Andric #include "AMDGPUTargetMachine.h"
2081ad6265SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
2181ad6265SDimitry Andric #include "SIInstrInfo.h"
2281ad6265SDimitry Andric #include "SIMachineFunctionInfo.h"
2381ad6265SDimitry Andric #include "llvm/ADT/BitmaskEnum.h"
24bdd1243dSDimitry Andric #include "llvm/ADT/DenseMap.h"
2581ad6265SDimitry Andric #include "llvm/CodeGen/MachineScheduler.h"
2681ad6265SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h"
2781ad6265SDimitry Andric 
2881ad6265SDimitry Andric using namespace llvm;
2981ad6265SDimitry Andric 
30bdd1243dSDimitry Andric #define DEBUG_TYPE "igrouplp"
3181ad6265SDimitry Andric 
3281ad6265SDimitry Andric namespace {
3381ad6265SDimitry Andric 
34bdd1243dSDimitry Andric static cl::opt<bool> EnableExactSolver(
35bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver", cl::Hidden,
36bdd1243dSDimitry Andric     cl::desc("Whether to use the exponential time solver to fit "
37bdd1243dSDimitry Andric              "the instructions to the pipeline as closely as "
38bdd1243dSDimitry Andric              "possible."),
3981ad6265SDimitry Andric     cl::init(false));
4081ad6265SDimitry Andric 
41bdd1243dSDimitry Andric static cl::opt<unsigned> CutoffForExact(
42bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43bdd1243dSDimitry Andric     cl::desc("The maximum number of scheduling group conflicts "
44bdd1243dSDimitry Andric              "which we attempt to solve with the exponential time "
45bdd1243dSDimitry Andric              "exact solver. Problem sizes greater than this will"
46bdd1243dSDimitry Andric              "be solved by the less accurate greedy algorithm. Selecting "
47bdd1243dSDimitry Andric              "solver by size is superseded by manually selecting "
48bdd1243dSDimitry Andric              "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
4981ad6265SDimitry Andric 
50bdd1243dSDimitry Andric static cl::opt<uint64_t> MaxBranchesExplored(
51bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52bdd1243dSDimitry Andric     cl::desc("The amount of branches that we are willing to explore with"
53bdd1243dSDimitry Andric              "the exact algorithm before giving up."));
5481ad6265SDimitry Andric 
55bdd1243dSDimitry Andric static cl::opt<bool> UseCostHeur(
56bdd1243dSDimitry Andric     "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57bdd1243dSDimitry Andric     cl::desc("Whether to use the cost heuristic to make choices as we "
58bdd1243dSDimitry Andric              "traverse the search space using the exact solver. Defaulted "
59bdd1243dSDimitry Andric              "to on, and if turned off, we will use the node order -- "
60bdd1243dSDimitry Andric              "attempting to put the later nodes in the later sched groups. "
61bdd1243dSDimitry Andric              "Experimentally, results are mixed, so this should be set on a "
62bdd1243dSDimitry Andric              "case-by-case basis."));
6381ad6265SDimitry Andric 
// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,        // Any ALU instruction.
  VALU = 1u << 1,       // Vector ALU only.
  SALU = 1u << 2,       // Scalar ALU only.
  MFMA = 1u << 3,       // MFMA (matrix) instructions.
  VMEM = 1u << 4,       // Any vector-memory access.
  VMEM_READ = 1u << 5,  // Vector-memory reads only.
  VMEM_WRITE = 1u << 6, // Vector-memory writes only.
  DS = 1u << 7,         // Any DS (LDS) access.
  DS_READ = 1u << 8,    // DS reads only.
  DS_WRITE = 1u << 9,   // DS writes only.
  // Union of every individual instruction-type bit above.
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
8281ad6265SDimitry Andric 
8306c3fb27SDimitry Andric class SchedGroup;
8406c3fb27SDimitry Andric 
8506c3fb27SDimitry Andric // InstructionRule class is used to enact a filter which determines whether or
8606c3fb27SDimitry Andric // not an SU maps to a given SchedGroup. It contains complementary data
8706c3fb27SDimitry Andric // structures (e.g Cache) to help those filters.
8806c3fb27SDimitry Andric class InstructionRule {
8906c3fb27SDimitry Andric protected:
9006c3fb27SDimitry Andric   const SIInstrInfo *TII;
9106c3fb27SDimitry Andric   unsigned SGID;
9206c3fb27SDimitry Andric   // A cache made available to the Filter to store SUnits for subsequent
9306c3fb27SDimitry Andric   // invocations of the Filter
9406c3fb27SDimitry Andric   std::optional<SmallVector<SUnit *, 4>> Cache;
9506c3fb27SDimitry Andric 
9606c3fb27SDimitry Andric public:
9706c3fb27SDimitry Andric   virtual bool
9806c3fb27SDimitry Andric   apply(const SUnit *, const ArrayRef<SUnit *>,
9906c3fb27SDimitry Andric         SmallVectorImpl<SchedGroup> &) {
10006c3fb27SDimitry Andric     return true;
10106c3fb27SDimitry Andric   };
10206c3fb27SDimitry Andric 
10306c3fb27SDimitry Andric   InstructionRule(const SIInstrInfo *TII, unsigned SGID,
10406c3fb27SDimitry Andric                   bool NeedsCache = false)
10506c3fb27SDimitry Andric       : TII(TII), SGID(SGID) {
10606c3fb27SDimitry Andric     if (NeedsCache) {
10706c3fb27SDimitry Andric       Cache = SmallVector<SUnit *, 4>();
10806c3fb27SDimitry Andric     }
10906c3fb27SDimitry Andric   }
11006c3fb27SDimitry Andric 
11106c3fb27SDimitry Andric   virtual ~InstructionRule() = default;
11206c3fb27SDimitry Andric };
11306c3fb27SDimitry Andric 
114bdd1243dSDimitry Andric typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
11581ad6265SDimitry Andric 
116bdd1243dSDimitry Andric // Classify instructions into groups to enable fine tuned control over the
117bdd1243dSDimitry Andric // scheduler. These groups may be more specific than current SchedModel
118bdd1243dSDimitry Andric // instruction classes.
119bdd1243dSDimitry Andric class SchedGroup {
120bdd1243dSDimitry Andric private:
121bdd1243dSDimitry Andric   // Mask that defines which instruction types can be classified into this
122bdd1243dSDimitry Andric   // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
123bdd1243dSDimitry Andric   // and SCHED_GROUP_BARRIER.
124bdd1243dSDimitry Andric   SchedGroupMask SGMask;
12581ad6265SDimitry Andric 
126bdd1243dSDimitry Andric   // Maximum number of SUnits that can be added to this group.
127bdd1243dSDimitry Andric   std::optional<unsigned> MaxSize;
12881ad6265SDimitry Andric 
129bdd1243dSDimitry Andric   // SchedGroups will only synchronize with other SchedGroups that have the same
130bdd1243dSDimitry Andric   // SyncID.
131bdd1243dSDimitry Andric   int SyncID = 0;
13281ad6265SDimitry Andric 
133bdd1243dSDimitry Andric   // SGID is used to map instructions to candidate SchedGroups
134bdd1243dSDimitry Andric   unsigned SGID;
135bdd1243dSDimitry Andric 
13606c3fb27SDimitry Andric   // The different rules each instruction in this SchedGroup must conform to
13706c3fb27SDimitry Andric   SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;
13806c3fb27SDimitry Andric 
139bdd1243dSDimitry Andric   // Count of the number of created SchedGroups, used to initialize SGID.
140bdd1243dSDimitry Andric   static unsigned NumSchedGroups;
141bdd1243dSDimitry Andric 
142bdd1243dSDimitry Andric   const SIInstrInfo *TII;
143bdd1243dSDimitry Andric 
144bdd1243dSDimitry Andric   // Try to add and edge from SU A to SU B.
145bdd1243dSDimitry Andric   bool tryAddEdge(SUnit *A, SUnit *B);
146bdd1243dSDimitry Andric 
147bdd1243dSDimitry Andric   // Use SGMask to determine whether we can classify MI as a member of this
148bdd1243dSDimitry Andric   // SchedGroup object.
149bdd1243dSDimitry Andric   bool canAddMI(const MachineInstr &MI) const;
15081ad6265SDimitry Andric 
15181ad6265SDimitry Andric public:
152bdd1243dSDimitry Andric   // Collection of SUnits that are classified as members of this group.
153bdd1243dSDimitry Andric   SmallVector<SUnit *, 32> Collection;
15481ad6265SDimitry Andric 
15506c3fb27SDimitry Andric   ScheduleDAGInstrs *DAG;
15606c3fb27SDimitry Andric 
157bdd1243dSDimitry Andric   // Returns true if SU can be added to this SchedGroup.
158bdd1243dSDimitry Andric   bool canAddSU(SUnit &SU) const;
15981ad6265SDimitry Andric 
160bdd1243dSDimitry Andric   // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
161bdd1243dSDimitry Andric   // MakePred is true, SU will be a predecessor of the SUnits in this
162bdd1243dSDimitry Andric   // SchedGroup, otherwise SU will be a successor.
163bdd1243dSDimitry Andric   void link(SUnit &SU, bool MakePred = false);
16481ad6265SDimitry Andric 
165bdd1243dSDimitry Andric   // Add DAG dependencies and track which edges are added, and the count of
166bdd1243dSDimitry Andric   // missed edges
167bdd1243dSDimitry Andric   int link(SUnit &SU, bool MakePred,
168bdd1243dSDimitry Andric            std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
16981ad6265SDimitry Andric 
170bdd1243dSDimitry Andric   // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
171bdd1243dSDimitry Andric   // Use the predicate to determine whether SU should be a predecessor (P =
172bdd1243dSDimitry Andric   // true) or a successor (P = false) of this SchedGroup.
173bdd1243dSDimitry Andric   void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
17481ad6265SDimitry Andric 
175bdd1243dSDimitry Andric   // Add DAG dependencies such that SUnits in this group shall be ordered
176bdd1243dSDimitry Andric   // before SUnits in OtherGroup.
177bdd1243dSDimitry Andric   void link(SchedGroup &OtherGroup);
178bdd1243dSDimitry Andric 
179bdd1243dSDimitry Andric   // Returns true if no more instructions may be added to this group.
180bdd1243dSDimitry Andric   bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
181bdd1243dSDimitry Andric 
18206c3fb27SDimitry Andric   // Append a constraint that SUs must meet in order to fit into this
18306c3fb27SDimitry Andric   // SchedGroup. Since many rules involve the relationship between a SchedGroup
18406c3fb27SDimitry Andric   // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
18506c3fb27SDimitry Andric   // time (rather than SchedGroup init time.)
18606c3fb27SDimitry Andric   void addRule(std::shared_ptr<InstructionRule> NewRule) {
18706c3fb27SDimitry Andric     Rules.push_back(NewRule);
18806c3fb27SDimitry Andric   }
18906c3fb27SDimitry Andric 
19006c3fb27SDimitry Andric   // Returns true if the SU matches all rules
19106c3fb27SDimitry Andric   bool allowedByRules(const SUnit *SU,
19206c3fb27SDimitry Andric                       SmallVectorImpl<SchedGroup> &SyncPipe) const {
19306c3fb27SDimitry Andric     if (Rules.empty())
19406c3fb27SDimitry Andric       return true;
19506c3fb27SDimitry Andric     for (size_t I = 0; I < Rules.size(); I++) {
19606c3fb27SDimitry Andric       auto TheRule = Rules[I].get();
19706c3fb27SDimitry Andric       if (!TheRule->apply(SU, Collection, SyncPipe)) {
19806c3fb27SDimitry Andric         return false;
19906c3fb27SDimitry Andric       }
20006c3fb27SDimitry Andric     }
20106c3fb27SDimitry Andric     return true;
20206c3fb27SDimitry Andric   }
20306c3fb27SDimitry Andric 
204bdd1243dSDimitry Andric   // Add SU to the SchedGroup.
205bdd1243dSDimitry Andric   void add(SUnit &SU) {
206bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
207bdd1243dSDimitry Andric                       << format_hex((int)SGMask, 10, true) << " adding "
208bdd1243dSDimitry Andric                       << *SU.getInstr());
209bdd1243dSDimitry Andric     Collection.push_back(&SU);
21081ad6265SDimitry Andric   }
21181ad6265SDimitry Andric 
212bdd1243dSDimitry Andric   // Remove last element in the SchedGroup
213bdd1243dSDimitry Andric   void pop() { Collection.pop_back(); }
214bdd1243dSDimitry Andric 
215bdd1243dSDimitry Andric   // Identify and add all relevant SUs from the DAG to this SchedGroup.
216bdd1243dSDimitry Andric   void initSchedGroup();
217bdd1243dSDimitry Andric 
218bdd1243dSDimitry Andric   // Add instructions to the SchedGroup bottom up starting from RIter.
219bdd1243dSDimitry Andric   // PipelineInstrs is a set of instructions that should not be added to the
220bdd1243dSDimitry Andric   // SchedGroup even when the other conditions for adding it are satisfied.
221bdd1243dSDimitry Andric   // RIter will be added to the SchedGroup as well, and dependencies will be
222bdd1243dSDimitry Andric   // added so that RIter will always be scheduled at the end of the group.
223bdd1243dSDimitry Andric   void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
224bdd1243dSDimitry Andric                       SUnitsToCandidateSGsMap &SyncedInstrs);
225bdd1243dSDimitry Andric 
226bdd1243dSDimitry Andric   void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
227bdd1243dSDimitry Andric 
228bdd1243dSDimitry Andric   int getSyncID() { return SyncID; }
229bdd1243dSDimitry Andric 
230bdd1243dSDimitry Andric   int getSGID() { return SGID; }
231bdd1243dSDimitry Andric 
232bdd1243dSDimitry Andric   SchedGroupMask getMask() { return SGMask; }
233bdd1243dSDimitry Andric 
234bdd1243dSDimitry Andric   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
235bdd1243dSDimitry Andric              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
23606c3fb27SDimitry Andric       : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
237bdd1243dSDimitry Andric     SGID = NumSchedGroups++;
238bdd1243dSDimitry Andric   }
239bdd1243dSDimitry Andric 
240bdd1243dSDimitry Andric   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
241bdd1243dSDimitry Andric              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
24206c3fb27SDimitry Andric       : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
243bdd1243dSDimitry Andric     SGID = NumSchedGroups++;
244bdd1243dSDimitry Andric   }
245bdd1243dSDimitry Andric };
246bdd1243dSDimitry Andric 
247bdd1243dSDimitry Andric // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
248bdd1243dSDimitry Andric static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
249bdd1243dSDimitry Andric   assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
250bdd1243dSDimitry Andric          SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
251bdd1243dSDimitry Andric          SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
252bdd1243dSDimitry Andric 
253bdd1243dSDimitry Andric   while (!SU.Preds.empty())
254bdd1243dSDimitry Andric     for (auto &P : SU.Preds)
255bdd1243dSDimitry Andric       SU.removePred(P);
256bdd1243dSDimitry Andric 
257bdd1243dSDimitry Andric   while (!SU.Succs.empty())
258bdd1243dSDimitry Andric     for (auto &S : SU.Succs)
259bdd1243dSDimitry Andric       for (auto &SP : S.getSUnit()->Preds)
260bdd1243dSDimitry Andric         if (SP.getSUnit() == &SU)
261bdd1243dSDimitry Andric           S.getSUnit()->removePred(SP);
262bdd1243dSDimitry Andric }
263bdd1243dSDimitry Andric 
264bdd1243dSDimitry Andric typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
265bdd1243dSDimitry Andric typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
266bdd1243dSDimitry Andric 
267bdd1243dSDimitry Andric // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
268bdd1243dSDimitry Andric // in non-trivial cases. For example, if the requested pipeline is
269bdd1243dSDimitry Andric // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
270bdd1243dSDimitry Andric // in the DAG, then we will have an instruction that can not be trivially
271bdd1243dSDimitry Andric // assigned to a SchedGroup. The PipelineSolver class implements two algorithms
272bdd1243dSDimitry Andric // to find a good solution to the pipeline -- a greedy algorithm and an exact
273bdd1243dSDimitry Andric // algorithm. The exact algorithm has an exponential time complexity and should
274bdd1243dSDimitry Andric // only be used for small sized problems or medium sized problems where an exact
275bdd1243dSDimitry Andric // solution is highly desired.
276bdd1243dSDimitry Andric class PipelineSolver {
277bdd1243dSDimitry Andric   ScheduleDAGMI *DAG;
278bdd1243dSDimitry Andric 
279bdd1243dSDimitry Andric   // Instructions that can be assigned to multiple SchedGroups
280bdd1243dSDimitry Andric   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
281bdd1243dSDimitry Andric   SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
282bdd1243dSDimitry Andric   DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
283bdd1243dSDimitry Andric   // The current working pipeline
284bdd1243dSDimitry Andric   SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
285bdd1243dSDimitry Andric   // The pipeline that has the best solution found so far
286bdd1243dSDimitry Andric   SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
287bdd1243dSDimitry Andric 
288bdd1243dSDimitry Andric   // Whether or not we actually have any SyncedInstrs to try to solve.
289bdd1243dSDimitry Andric   bool NeedsSolver = false;
290bdd1243dSDimitry Andric 
291bdd1243dSDimitry Andric   // Compute an estimate of the size of search tree -- the true size is
292bdd1243dSDimitry Andric   // the product of each conflictedInst.Matches.size() across all SyncPipelines
293bdd1243dSDimitry Andric   unsigned computeProblemSize();
294bdd1243dSDimitry Andric 
295bdd1243dSDimitry Andric   // The cost penalty of not assigning a SU to a SchedGroup
296bdd1243dSDimitry Andric   int MissPenalty = 0;
297bdd1243dSDimitry Andric 
298bdd1243dSDimitry Andric   // Costs in terms of the number of edges we are unable to add
299bdd1243dSDimitry Andric   int BestCost = -1;
300bdd1243dSDimitry Andric   int CurrCost = 0;
301bdd1243dSDimitry Andric 
302bdd1243dSDimitry Andric   // Index pointing to the conflicting instruction that is currently being
303bdd1243dSDimitry Andric   // fitted
304bdd1243dSDimitry Andric   int CurrConflInstNo = 0;
305bdd1243dSDimitry Andric   // Index to the pipeline that is currently being fitted
306bdd1243dSDimitry Andric   int CurrSyncGroupIdx = 0;
307bdd1243dSDimitry Andric   // The first non trivial pipeline
308bdd1243dSDimitry Andric   int BeginSyncGroupIdx = 0;
309bdd1243dSDimitry Andric 
310bdd1243dSDimitry Andric   // How many branches we have explored
311bdd1243dSDimitry Andric   uint64_t BranchesExplored = 0;
312bdd1243dSDimitry Andric 
31306c3fb27SDimitry Andric   // The direction in which we process the candidate SchedGroups per SU
31406c3fb27SDimitry Andric   bool IsBottomUp = 1;
31506c3fb27SDimitry Andric 
316bdd1243dSDimitry Andric   // Update indices to fit next conflicting instruction
317bdd1243dSDimitry Andric   void advancePosition();
318bdd1243dSDimitry Andric   // Recede indices to attempt to find better fit for previous conflicting
319bdd1243dSDimitry Andric   // instruction
320bdd1243dSDimitry Andric   void retreatPosition();
321bdd1243dSDimitry Andric 
322bdd1243dSDimitry Andric   // The exponential time algorithm which finds the provably best fit
323bdd1243dSDimitry Andric   bool solveExact();
324bdd1243dSDimitry Andric   // The polynomial time algorithm which attempts to find a good fit
325bdd1243dSDimitry Andric   bool solveGreedy();
32606c3fb27SDimitry Andric   // Find the best SchedGroup for the current SU using the heuristic given all
32706c3fb27SDimitry Andric   // current information. One step in the greedy algorithm. Templated against
32806c3fb27SDimitry Andric   // the SchedGroup iterator (either reverse or forward).
32906c3fb27SDimitry Andric   template <typename T>
33006c3fb27SDimitry Andric   void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
33106c3fb27SDimitry Andric                   T E);
332bdd1243dSDimitry Andric   // Whether or not the current solution is optimal
333bdd1243dSDimitry Andric   bool checkOptimal();
334bdd1243dSDimitry Andric   // Populate the ready list, prioiritizing fewest missed edges first
33506c3fb27SDimitry Andric   // Templated against the SchedGroup iterator (either reverse or forward).
33606c3fb27SDimitry Andric   template <typename T>
33706c3fb27SDimitry Andric   void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
33806c3fb27SDimitry Andric                          T E);
339bdd1243dSDimitry Andric   // Add edges corresponding to the SchedGroups as assigned by solver
340bdd1243dSDimitry Andric   void makePipeline();
34106c3fb27SDimitry Andric   // Link the SchedGroups in the best found pipeline.
34206c3fb27SDimitry Andric   // Tmplated against the SchedGroup iterator (either reverse or forward).
34306c3fb27SDimitry Andric   template <typename T> void linkSchedGroups(T I, T E);
344bdd1243dSDimitry Andric   // Add the edges from the SU to the other SchedGroups in pipeline, and
345bdd1243dSDimitry Andric   // return the number of edges missed.
346bdd1243dSDimitry Andric   int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
347bdd1243dSDimitry Andric                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
348*5f757f3fSDimitry Andric   /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
349*5f757f3fSDimitry Andric   /// returns the cost (in terms of missed pipeline edges), and tracks the edges
350*5f757f3fSDimitry Andric   /// added in \p AddedEdges
35106c3fb27SDimitry Andric   template <typename T>
35206c3fb27SDimitry Andric   int linkSUnit(SUnit *SU, int SGID,
35306c3fb27SDimitry Andric                 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
354*5f757f3fSDimitry Andric   /// Remove the edges passed via \p AddedEdges
355bdd1243dSDimitry Andric   void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
356bdd1243dSDimitry Andric   // Convert the passed in maps to arrays for bidirectional iterators
357bdd1243dSDimitry Andric   void convertSyncMapsToArrays();
358bdd1243dSDimitry Andric 
359bdd1243dSDimitry Andric   void reset();
360bdd1243dSDimitry Andric 
361bdd1243dSDimitry Andric public:
362bdd1243dSDimitry Andric   // Invoke the solver to map instructions to instruction groups. Heuristic &&
363bdd1243dSDimitry Andric   // command-line-option determines to use exact or greedy algorithm.
364bdd1243dSDimitry Andric   void solve();
365bdd1243dSDimitry Andric 
366bdd1243dSDimitry Andric   PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
367bdd1243dSDimitry Andric                  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
36806c3fb27SDimitry Andric                  ScheduleDAGMI *DAG, bool IsBottomUp = 1)
369bdd1243dSDimitry Andric       : DAG(DAG), SyncedInstrs(SyncedInstrs),
37006c3fb27SDimitry Andric         SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
371bdd1243dSDimitry Andric 
372bdd1243dSDimitry Andric     for (auto &PipelineInstrs : SyncedInstrs) {
373bdd1243dSDimitry Andric       if (PipelineInstrs.second.size() > 0) {
374bdd1243dSDimitry Andric         NeedsSolver = true;
375bdd1243dSDimitry Andric         break;
376bdd1243dSDimitry Andric       }
377bdd1243dSDimitry Andric     }
378bdd1243dSDimitry Andric 
379bdd1243dSDimitry Andric     if (!NeedsSolver)
380bdd1243dSDimitry Andric       return;
381bdd1243dSDimitry Andric 
382bdd1243dSDimitry Andric     convertSyncMapsToArrays();
383bdd1243dSDimitry Andric 
384bdd1243dSDimitry Andric     CurrPipeline = BestPipeline;
385bdd1243dSDimitry Andric 
386bdd1243dSDimitry Andric     while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
387bdd1243dSDimitry Andric            PipelineInstrs[BeginSyncGroupIdx].size() == 0)
388bdd1243dSDimitry Andric       ++BeginSyncGroupIdx;
389bdd1243dSDimitry Andric 
390bdd1243dSDimitry Andric     if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
391bdd1243dSDimitry Andric       return;
392bdd1243dSDimitry Andric   }
393bdd1243dSDimitry Andric };
394bdd1243dSDimitry Andric 
395bdd1243dSDimitry Andric void PipelineSolver::reset() {
396bdd1243dSDimitry Andric 
397bdd1243dSDimitry Andric   for (auto &SyncPipeline : CurrPipeline) {
398bdd1243dSDimitry Andric     for (auto &SG : SyncPipeline) {
399bdd1243dSDimitry Andric       SmallVector<SUnit *, 32> TempCollection = SG.Collection;
400bdd1243dSDimitry Andric       SG.Collection.clear();
401bdd1243dSDimitry Andric       auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
402bdd1243dSDimitry Andric         return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
403bdd1243dSDimitry Andric       });
404bdd1243dSDimitry Andric       if (SchedBarr != TempCollection.end())
405bdd1243dSDimitry Andric         SG.Collection.push_back(*SchedBarr);
406bdd1243dSDimitry Andric     }
407bdd1243dSDimitry Andric   }
408bdd1243dSDimitry Andric 
409bdd1243dSDimitry Andric   CurrSyncGroupIdx = BeginSyncGroupIdx;
410bdd1243dSDimitry Andric   CurrConflInstNo = 0;
411bdd1243dSDimitry Andric   CurrCost = 0;
412bdd1243dSDimitry Andric }
413bdd1243dSDimitry Andric 
414bdd1243dSDimitry Andric void PipelineSolver::convertSyncMapsToArrays() {
415bdd1243dSDimitry Andric   for (auto &SyncPipe : SyncedSchedGroups) {
416bdd1243dSDimitry Andric     BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
417bdd1243dSDimitry Andric   }
418bdd1243dSDimitry Andric 
419bdd1243dSDimitry Andric   int PipelineIDx = SyncedInstrs.size() - 1;
420bdd1243dSDimitry Andric   PipelineInstrs.resize(SyncedInstrs.size());
421bdd1243dSDimitry Andric   for (auto &SyncInstrMap : SyncedInstrs) {
422bdd1243dSDimitry Andric     for (auto &SUsToCandSGs : SyncInstrMap.second) {
423bdd1243dSDimitry Andric       if (PipelineInstrs[PipelineIDx].size() == 0) {
424bdd1243dSDimitry Andric         PipelineInstrs[PipelineIDx].push_back(
425bdd1243dSDimitry Andric             std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
426bdd1243dSDimitry Andric         continue;
427bdd1243dSDimitry Andric       }
428bdd1243dSDimitry Andric       auto SortPosition = PipelineInstrs[PipelineIDx].begin();
429bdd1243dSDimitry Andric       // Insert them in sorted order -- this allows for good parsing order in
430bdd1243dSDimitry Andric       // the greedy algorithm
431bdd1243dSDimitry Andric       while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
432bdd1243dSDimitry Andric              SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
433bdd1243dSDimitry Andric         ++SortPosition;
434bdd1243dSDimitry Andric       PipelineInstrs[PipelineIDx].insert(
435bdd1243dSDimitry Andric           SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
436bdd1243dSDimitry Andric     }
437bdd1243dSDimitry Andric     --PipelineIDx;
438bdd1243dSDimitry Andric   }
439bdd1243dSDimitry Andric }
440bdd1243dSDimitry Andric 
44106c3fb27SDimitry Andric template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
44206c3fb27SDimitry Andric   for (; I != E; ++I) {
44306c3fb27SDimitry Andric     auto &GroupA = *I;
44406c3fb27SDimitry Andric     for (auto J = std::next(I); J != E; ++J) {
44506c3fb27SDimitry Andric       auto &GroupB = *J;
44606c3fb27SDimitry Andric       GroupA.link(GroupB);
44706c3fb27SDimitry Andric     }
44806c3fb27SDimitry Andric   }
44906c3fb27SDimitry Andric }
45006c3fb27SDimitry Andric 
451bdd1243dSDimitry Andric void PipelineSolver::makePipeline() {
452bdd1243dSDimitry Andric   // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
453bdd1243dSDimitry Andric   for (auto &SyncPipeline : BestPipeline) {
45406c3fb27SDimitry Andric     LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
455bdd1243dSDimitry Andric     for (auto &SG : SyncPipeline) {
45606c3fb27SDimitry Andric       LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
45706c3fb27SDimitry Andric                         << " has: \n");
458bdd1243dSDimitry Andric       SUnit *SGBarr = nullptr;
459bdd1243dSDimitry Andric       for (auto &SU : SG.Collection) {
460bdd1243dSDimitry Andric         if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
461bdd1243dSDimitry Andric           SGBarr = SU;
46206c3fb27SDimitry Andric         LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
463bdd1243dSDimitry Andric       }
464bdd1243dSDimitry Andric       // Command line requested IGroupLP doesn't have SGBarr
465bdd1243dSDimitry Andric       if (!SGBarr)
466bdd1243dSDimitry Andric         continue;
467bdd1243dSDimitry Andric       resetEdges(*SGBarr, DAG);
468bdd1243dSDimitry Andric       SG.link(*SGBarr, false);
469bdd1243dSDimitry Andric     }
470bdd1243dSDimitry Andric   }
471bdd1243dSDimitry Andric 
472bdd1243dSDimitry Andric   for (auto &SyncPipeline : BestPipeline) {
47306c3fb27SDimitry Andric     IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
47406c3fb27SDimitry Andric                : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
47581ad6265SDimitry Andric   }
47681ad6265SDimitry Andric }
47706c3fb27SDimitry Andric 
47806c3fb27SDimitry Andric template <typename T>
47906c3fb27SDimitry Andric int PipelineSolver::linkSUnit(
48006c3fb27SDimitry Andric     SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
48106c3fb27SDimitry Andric     T I, T E) {
48206c3fb27SDimitry Andric   bool MakePred = false;
48306c3fb27SDimitry Andric   int AddedCost = 0;
48406c3fb27SDimitry Andric   for (; I < E; ++I) {
48506c3fb27SDimitry Andric     if (I->getSGID() == SGID) {
48606c3fb27SDimitry Andric       MakePred = true;
48706c3fb27SDimitry Andric       continue;
48881ad6265SDimitry Andric     }
48906c3fb27SDimitry Andric     auto Group = *I;
49006c3fb27SDimitry Andric     AddedCost += Group.link(*SU, MakePred, AddedEdges);
49106c3fb27SDimitry Andric     assert(AddedCost >= 0);
49206c3fb27SDimitry Andric   }
49306c3fb27SDimitry Andric   return AddedCost;
494bdd1243dSDimitry Andric }
49581ad6265SDimitry Andric 
496bdd1243dSDimitry Andric int PipelineSolver::addEdges(
497bdd1243dSDimitry Andric     SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
498bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
499bdd1243dSDimitry Andric 
50006c3fb27SDimitry Andric   // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
50106c3fb27SDimitry Andric   // instructions that are the ultimate successors in the resultant mutation.
50206c3fb27SDimitry Andric   // Therefore, in such a configuration, the SchedGroups occurring before the
50306c3fb27SDimitry Andric   // candidate SGID are successors of the candidate SchedGroup, thus the current
50406c3fb27SDimitry Andric   // SU should be linked as a predecessor to SUs in those SchedGroups. The
50506c3fb27SDimitry Andric   // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
50606c3fb27SDimitry Andric   // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
50706c3fb27SDimitry Andric   // IsBottomUp (in reverse).
50806c3fb27SDimitry Andric   return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
50906c3fb27SDimitry Andric                                 SyncPipeline.rend())
51006c3fb27SDimitry Andric                     : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
51106c3fb27SDimitry Andric                                 SyncPipeline.end());
512bdd1243dSDimitry Andric }
513bdd1243dSDimitry Andric 
514bdd1243dSDimitry Andric void PipelineSolver::removeEdges(
515bdd1243dSDimitry Andric     const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
516bdd1243dSDimitry Andric   // Only remove the edges that we have added when testing
517bdd1243dSDimitry Andric   // the fit.
518bdd1243dSDimitry Andric   for (auto &PredSuccPair : EdgesToRemove) {
519bdd1243dSDimitry Andric     SUnit *Pred = PredSuccPair.first;
520bdd1243dSDimitry Andric     SUnit *Succ = PredSuccPair.second;
521bdd1243dSDimitry Andric 
522bdd1243dSDimitry Andric     auto Match = llvm::find_if(
523bdd1243dSDimitry Andric         Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
524bdd1243dSDimitry Andric     if (Match != Succ->Preds.end()) {
525bdd1243dSDimitry Andric       assert(Match->isArtificial());
526bdd1243dSDimitry Andric       Succ->removePred(*Match);
527bdd1243dSDimitry Andric     }
528bdd1243dSDimitry Andric   }
529bdd1243dSDimitry Andric }
530bdd1243dSDimitry Andric 
531bdd1243dSDimitry Andric void PipelineSolver::advancePosition() {
532bdd1243dSDimitry Andric   ++CurrConflInstNo;
533bdd1243dSDimitry Andric 
534bdd1243dSDimitry Andric   if (static_cast<size_t>(CurrConflInstNo) >=
535bdd1243dSDimitry Andric       PipelineInstrs[CurrSyncGroupIdx].size()) {
536bdd1243dSDimitry Andric     CurrConflInstNo = 0;
537bdd1243dSDimitry Andric     ++CurrSyncGroupIdx;
538bdd1243dSDimitry Andric     // Advance to next non-trivial pipeline
539bdd1243dSDimitry Andric     while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
540bdd1243dSDimitry Andric            PipelineInstrs[CurrSyncGroupIdx].size() == 0)
541bdd1243dSDimitry Andric       ++CurrSyncGroupIdx;
542bdd1243dSDimitry Andric   }
543bdd1243dSDimitry Andric }
544bdd1243dSDimitry Andric 
545bdd1243dSDimitry Andric void PipelineSolver::retreatPosition() {
546bdd1243dSDimitry Andric   assert(CurrConflInstNo >= 0);
547bdd1243dSDimitry Andric   assert(CurrSyncGroupIdx >= 0);
548bdd1243dSDimitry Andric 
549bdd1243dSDimitry Andric   if (CurrConflInstNo > 0) {
550bdd1243dSDimitry Andric     --CurrConflInstNo;
551bdd1243dSDimitry Andric     return;
552bdd1243dSDimitry Andric   }
553bdd1243dSDimitry Andric 
554bdd1243dSDimitry Andric   if (CurrConflInstNo == 0) {
555bdd1243dSDimitry Andric     // If we return to the starting position, we have explored
556bdd1243dSDimitry Andric     // the entire tree
557bdd1243dSDimitry Andric     if (CurrSyncGroupIdx == BeginSyncGroupIdx)
558bdd1243dSDimitry Andric       return;
559bdd1243dSDimitry Andric 
560bdd1243dSDimitry Andric     --CurrSyncGroupIdx;
561bdd1243dSDimitry Andric     // Go to previous non-trivial pipeline
562bdd1243dSDimitry Andric     while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
563bdd1243dSDimitry Andric       --CurrSyncGroupIdx;
564bdd1243dSDimitry Andric 
565bdd1243dSDimitry Andric     CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
566bdd1243dSDimitry Andric   }
567bdd1243dSDimitry Andric }
568bdd1243dSDimitry Andric 
569bdd1243dSDimitry Andric bool PipelineSolver::checkOptimal() {
570bdd1243dSDimitry Andric   if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
571bdd1243dSDimitry Andric     if (BestCost == -1 || CurrCost < BestCost) {
572bdd1243dSDimitry Andric       BestPipeline = CurrPipeline;
573bdd1243dSDimitry Andric       BestCost = CurrCost;
574bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
575bdd1243dSDimitry Andric     }
576bdd1243dSDimitry Andric     assert(BestCost >= 0);
577bdd1243dSDimitry Andric   }
578bdd1243dSDimitry Andric 
579bdd1243dSDimitry Andric   bool DoneExploring = false;
580bdd1243dSDimitry Andric   if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
581bdd1243dSDimitry Andric     DoneExploring = true;
582bdd1243dSDimitry Andric 
583bdd1243dSDimitry Andric   return (DoneExploring || BestCost == 0);
584bdd1243dSDimitry Andric }
585bdd1243dSDimitry Andric 
58606c3fb27SDimitry Andric template <typename T>
587bdd1243dSDimitry Andric void PipelineSolver::populateReadyList(
58806c3fb27SDimitry Andric     SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
58906c3fb27SDimitry Andric   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
59006c3fb27SDimitry Andric   auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
591bdd1243dSDimitry Andric   assert(CurrSU.second.size() >= 1);
59206c3fb27SDimitry Andric 
593bdd1243dSDimitry Andric   for (; I != E; ++I) {
594bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
595bdd1243dSDimitry Andric     int CandSGID = *I;
596*5f757f3fSDimitry Andric     SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
597*5f757f3fSDimitry Andric       return SG.getSGID() == CandSGID;
598*5f757f3fSDimitry Andric     });
599*5f757f3fSDimitry Andric     assert(Match);
600bdd1243dSDimitry Andric 
601bdd1243dSDimitry Andric     if (UseCostHeur) {
602bdd1243dSDimitry Andric       if (Match->isFull()) {
603bdd1243dSDimitry Andric         ReadyList.push_back(std::pair(*I, MissPenalty));
604bdd1243dSDimitry Andric         continue;
605bdd1243dSDimitry Andric       }
606bdd1243dSDimitry Andric 
607bdd1243dSDimitry Andric       int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
608bdd1243dSDimitry Andric       ReadyList.push_back(std::pair(*I, TempCost));
609bdd1243dSDimitry Andric       removeEdges(AddedEdges);
610bdd1243dSDimitry Andric     } else
611bdd1243dSDimitry Andric       ReadyList.push_back(std::pair(*I, -1));
612bdd1243dSDimitry Andric   }
613bdd1243dSDimitry Andric 
614bdd1243dSDimitry Andric   if (UseCostHeur) {
615bdd1243dSDimitry Andric     std::sort(ReadyList.begin(), ReadyList.end(),
616bdd1243dSDimitry Andric               [](std::pair<int, int> A, std::pair<int, int> B) {
617bdd1243dSDimitry Andric                 return A.second < B.second;
618bdd1243dSDimitry Andric               });
619bdd1243dSDimitry Andric   }
620bdd1243dSDimitry Andric 
621bdd1243dSDimitry Andric   assert(ReadyList.size() == CurrSU.second.size());
622bdd1243dSDimitry Andric }
623bdd1243dSDimitry Andric 
624bdd1243dSDimitry Andric bool PipelineSolver::solveExact() {
625bdd1243dSDimitry Andric   if (checkOptimal())
626bdd1243dSDimitry Andric     return true;
627bdd1243dSDimitry Andric 
628bdd1243dSDimitry Andric   if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
629bdd1243dSDimitry Andric     return false;
630bdd1243dSDimitry Andric 
631bdd1243dSDimitry Andric   assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
632bdd1243dSDimitry Andric   assert(static_cast<size_t>(CurrConflInstNo) <
633bdd1243dSDimitry Andric          PipelineInstrs[CurrSyncGroupIdx].size());
634bdd1243dSDimitry Andric   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
635bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
636bdd1243dSDimitry Andric                     << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
637bdd1243dSDimitry Andric 
638bdd1243dSDimitry Andric   // SchedGroup -> Cost pairs
639bdd1243dSDimitry Andric   SmallVector<std::pair<int, int>, 4> ReadyList;
640bdd1243dSDimitry Andric   // Prioritize the candidate sched groups in terms of lowest cost first
64106c3fb27SDimitry Andric   IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
64206c3fb27SDimitry Andric                                  CurrSU.second.rend())
64306c3fb27SDimitry Andric              : populateReadyList(ReadyList, CurrSU.second.begin(),
64406c3fb27SDimitry Andric                                  CurrSU.second.end());
645bdd1243dSDimitry Andric 
646bdd1243dSDimitry Andric   auto I = ReadyList.begin();
647bdd1243dSDimitry Andric   auto E = ReadyList.end();
648bdd1243dSDimitry Andric   for (; I != E; ++I) {
649bdd1243dSDimitry Andric     // If we are trying SGs in least cost order, and the current SG is cost
650bdd1243dSDimitry Andric     // infeasible, then all subsequent SGs will also be cost infeasible, so we
651bdd1243dSDimitry Andric     // can prune.
652bdd1243dSDimitry Andric     if (BestCost != -1 && (CurrCost + I->second > BestCost))
653bdd1243dSDimitry Andric       return false;
654bdd1243dSDimitry Andric 
655bdd1243dSDimitry Andric     int CandSGID = I->first;
656bdd1243dSDimitry Andric     int AddedCost = 0;
657bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
658bdd1243dSDimitry Andric     auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
659bdd1243dSDimitry Andric     SchedGroup *Match;
660bdd1243dSDimitry Andric     for (auto &SG : SyncPipeline) {
661bdd1243dSDimitry Andric       if (SG.getSGID() == CandSGID)
662bdd1243dSDimitry Andric         Match = &SG;
663bdd1243dSDimitry Andric     }
664bdd1243dSDimitry Andric 
665bdd1243dSDimitry Andric     if (Match->isFull())
666bdd1243dSDimitry Andric       continue;
667bdd1243dSDimitry Andric 
66806c3fb27SDimitry Andric     if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
66906c3fb27SDimitry Andric       continue;
67006c3fb27SDimitry Andric 
671bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
672bdd1243dSDimitry Andric                       << (int)Match->getMask() << "and ID " << CandSGID
673bdd1243dSDimitry Andric                       << "\n");
674bdd1243dSDimitry Andric     Match->add(*CurrSU.first);
675bdd1243dSDimitry Andric     AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
676bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
677bdd1243dSDimitry Andric     CurrCost += AddedCost;
678bdd1243dSDimitry Andric     advancePosition();
679bdd1243dSDimitry Andric     ++BranchesExplored;
680bdd1243dSDimitry Andric     bool FinishedExploring = false;
681bdd1243dSDimitry Andric     // If the Cost after adding edges is greater than a known solution,
682bdd1243dSDimitry Andric     // backtrack
683bdd1243dSDimitry Andric     if (CurrCost < BestCost || BestCost == -1) {
684bdd1243dSDimitry Andric       if (solveExact()) {
685bdd1243dSDimitry Andric         FinishedExploring = BestCost != 0;
686bdd1243dSDimitry Andric         if (!FinishedExploring)
687bdd1243dSDimitry Andric           return true;
688bdd1243dSDimitry Andric       }
689bdd1243dSDimitry Andric     }
690bdd1243dSDimitry Andric 
691bdd1243dSDimitry Andric     retreatPosition();
692bdd1243dSDimitry Andric     CurrCost -= AddedCost;
693bdd1243dSDimitry Andric     removeEdges(AddedEdges);
694bdd1243dSDimitry Andric     Match->pop();
695bdd1243dSDimitry Andric     CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
696bdd1243dSDimitry Andric     if (FinishedExploring)
697bdd1243dSDimitry Andric       return true;
698bdd1243dSDimitry Andric   }
699bdd1243dSDimitry Andric 
700bdd1243dSDimitry Andric   // Try the pipeline where the current instruction is omitted
701bdd1243dSDimitry Andric   // Potentially if we omit a problematic instruction from the pipeline,
702bdd1243dSDimitry Andric   // all the other instructions can nicely fit.
703bdd1243dSDimitry Andric   CurrCost += MissPenalty;
704bdd1243dSDimitry Andric   advancePosition();
705bdd1243dSDimitry Andric 
706bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
707bdd1243dSDimitry Andric 
708bdd1243dSDimitry Andric   bool FinishedExploring = false;
709bdd1243dSDimitry Andric   if (CurrCost < BestCost || BestCost == -1) {
710bdd1243dSDimitry Andric     if (solveExact()) {
711bdd1243dSDimitry Andric       bool FinishedExploring = BestCost != 0;
712bdd1243dSDimitry Andric       if (!FinishedExploring)
713bdd1243dSDimitry Andric         return true;
714bdd1243dSDimitry Andric     }
715bdd1243dSDimitry Andric   }
716bdd1243dSDimitry Andric 
717bdd1243dSDimitry Andric   retreatPosition();
718bdd1243dSDimitry Andric   CurrCost -= MissPenalty;
719bdd1243dSDimitry Andric   return FinishedExploring;
720bdd1243dSDimitry Andric }
721bdd1243dSDimitry Andric 
72206c3fb27SDimitry Andric template <typename T>
72306c3fb27SDimitry Andric void PipelineSolver::greedyFind(
72406c3fb27SDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
725bdd1243dSDimitry Andric   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
726bdd1243dSDimitry Andric   int BestNodeCost = -1;
727bdd1243dSDimitry Andric   int TempCost;
728bdd1243dSDimitry Andric   SchedGroup *BestGroup = nullptr;
729bdd1243dSDimitry Andric   int BestGroupID = -1;
730bdd1243dSDimitry Andric   auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
731bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
732bdd1243dSDimitry Andric                     << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
733bdd1243dSDimitry Andric 
734bdd1243dSDimitry Andric   // Since we have added the potential SchedGroups from bottom up, but
735bdd1243dSDimitry Andric   // traversed the DAG from top down, parse over the groups from last to
736bdd1243dSDimitry Andric   // first. If we fail to do this for the greedy algorithm, the solution will
737bdd1243dSDimitry Andric   // likely not be good in more complex cases.
738bdd1243dSDimitry Andric   for (; I != E; ++I) {
739bdd1243dSDimitry Andric     std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
740bdd1243dSDimitry Andric     int CandSGID = *I;
741*5f757f3fSDimitry Andric     SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
742*5f757f3fSDimitry Andric       return SG.getSGID() == CandSGID;
743*5f757f3fSDimitry Andric     });
744*5f757f3fSDimitry Andric     assert(Match);
745bdd1243dSDimitry Andric 
746bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
747bdd1243dSDimitry Andric                       << (int)Match->getMask() << "\n");
748bdd1243dSDimitry Andric 
749bdd1243dSDimitry Andric     if (Match->isFull()) {
750bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
751bdd1243dSDimitry Andric       continue;
752bdd1243dSDimitry Andric     }
75306c3fb27SDimitry Andric     if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
75406c3fb27SDimitry Andric       LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
75506c3fb27SDimitry Andric       continue;
75606c3fb27SDimitry Andric     }
757bdd1243dSDimitry Andric     TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
758bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
759bdd1243dSDimitry Andric     if (TempCost < BestNodeCost || BestNodeCost == -1) {
760bdd1243dSDimitry Andric       BestGroup = Match;
761bdd1243dSDimitry Andric       BestNodeCost = TempCost;
762bdd1243dSDimitry Andric       BestGroupID = CandSGID;
763bdd1243dSDimitry Andric     }
764bdd1243dSDimitry Andric     removeEdges(AddedEdges);
765bdd1243dSDimitry Andric     if (BestNodeCost == 0)
766bdd1243dSDimitry Andric       break;
767bdd1243dSDimitry Andric   }
768bdd1243dSDimitry Andric 
769bdd1243dSDimitry Andric   if (BestGroupID != -1) {
770bdd1243dSDimitry Andric     BestGroup->add(*CurrSU.first);
771bdd1243dSDimitry Andric     addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
772bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
773bdd1243dSDimitry Andric                       << (int)BestGroup->getMask() << "\n");
774bdd1243dSDimitry Andric     BestCost += TempCost;
775bdd1243dSDimitry Andric   } else
776bdd1243dSDimitry Andric     BestCost += MissPenalty;
777bdd1243dSDimitry Andric 
778bdd1243dSDimitry Andric   CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
77906c3fb27SDimitry Andric }
78006c3fb27SDimitry Andric 
78106c3fb27SDimitry Andric bool PipelineSolver::solveGreedy() {
78206c3fb27SDimitry Andric   BestCost = 0;
78306c3fb27SDimitry Andric   std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
78406c3fb27SDimitry Andric 
78506c3fb27SDimitry Andric   while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
78606c3fb27SDimitry Andric     SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
78706c3fb27SDimitry Andric     IsBottomUp
78806c3fb27SDimitry Andric         ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
78906c3fb27SDimitry Andric         : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
790bdd1243dSDimitry Andric     advancePosition();
791bdd1243dSDimitry Andric   }
792bdd1243dSDimitry Andric   BestPipeline = CurrPipeline;
793bdd1243dSDimitry Andric   removeEdges(AddedEdges);
794bdd1243dSDimitry Andric   return false;
795bdd1243dSDimitry Andric }
796bdd1243dSDimitry Andric 
797bdd1243dSDimitry Andric unsigned PipelineSolver::computeProblemSize() {
798bdd1243dSDimitry Andric   unsigned ProblemSize = 0;
799bdd1243dSDimitry Andric   for (auto &PipeConflicts : PipelineInstrs) {
800bdd1243dSDimitry Andric     ProblemSize += PipeConflicts.size();
801bdd1243dSDimitry Andric   }
802bdd1243dSDimitry Andric 
803bdd1243dSDimitry Andric   return ProblemSize;
804bdd1243dSDimitry Andric }
805bdd1243dSDimitry Andric 
806bdd1243dSDimitry Andric void PipelineSolver::solve() {
807bdd1243dSDimitry Andric   if (!NeedsSolver)
808bdd1243dSDimitry Andric     return;
809bdd1243dSDimitry Andric 
810bdd1243dSDimitry Andric   unsigned ProblemSize = computeProblemSize();
811bdd1243dSDimitry Andric   assert(ProblemSize > 0);
812bdd1243dSDimitry Andric 
813bdd1243dSDimitry Andric   bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
814bdd1243dSDimitry Andric   MissPenalty = (ProblemSize / 2) + 1;
815bdd1243dSDimitry Andric 
816bdd1243dSDimitry Andric   LLVM_DEBUG(DAG->dump());
817bdd1243dSDimitry Andric   if (EnableExactSolver || BelowCutoff) {
818bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
819bdd1243dSDimitry Andric     solveGreedy();
820bdd1243dSDimitry Andric     reset();
821bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
822bdd1243dSDimitry Andric     if (BestCost > 0) {
823bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
824bdd1243dSDimitry Andric       solveExact();
825bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
826bdd1243dSDimitry Andric     }
827bdd1243dSDimitry Andric   } else { // Use the Greedy Algorithm by default
828bdd1243dSDimitry Andric     LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
829bdd1243dSDimitry Andric     solveGreedy();
830bdd1243dSDimitry Andric   }
831bdd1243dSDimitry Andric 
832bdd1243dSDimitry Andric   makePipeline();
83306c3fb27SDimitry Andric   LLVM_DEBUG(dbgs() << "After applying mutation\n");
83406c3fb27SDimitry Andric   LLVM_DEBUG(DAG->dump());
835bdd1243dSDimitry Andric }
836bdd1243dSDimitry Andric 
83706c3fb27SDimitry Andric enum IGLPStrategyID : int {
83806c3fb27SDimitry Andric   MFMASmallGemmOptID = 0,
83906c3fb27SDimitry Andric   MFMASmallGemmSingleWaveOptID = 1,
84006c3fb27SDimitry Andric };
841bdd1243dSDimitry Andric 
842bdd1243dSDimitry Andric // Implement a IGLP scheduling strategy.
843bdd1243dSDimitry Andric class IGLPStrategy {
844bdd1243dSDimitry Andric protected:
845bdd1243dSDimitry Andric   ScheduleDAGInstrs *DAG;
846bdd1243dSDimitry Andric 
847bdd1243dSDimitry Andric   const SIInstrInfo *TII;
848bdd1243dSDimitry Andric 
849bdd1243dSDimitry Andric public:
850*5f757f3fSDimitry Andric   /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
851bdd1243dSDimitry Andric   virtual void applyIGLPStrategy(
852bdd1243dSDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
853*5f757f3fSDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
854*5f757f3fSDimitry Andric       bool IsReentry) = 0;
855bdd1243dSDimitry Andric 
856bdd1243dSDimitry Andric   // Returns true if this strategy should be applied to a ScheduleDAG.
857bdd1243dSDimitry Andric   virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
858bdd1243dSDimitry Andric 
85906c3fb27SDimitry Andric   bool IsBottomUp = 1;
86006c3fb27SDimitry Andric 
861bdd1243dSDimitry Andric   IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
862bdd1243dSDimitry Andric       : DAG(DAG), TII(TII) {}
863bdd1243dSDimitry Andric 
864bdd1243dSDimitry Andric   virtual ~IGLPStrategy() = default;
865bdd1243dSDimitry Andric };
866bdd1243dSDimitry Andric 
867bdd1243dSDimitry Andric class MFMASmallGemmOpt final : public IGLPStrategy {
86806c3fb27SDimitry Andric private:
869bdd1243dSDimitry Andric public:
870bdd1243dSDimitry Andric   void applyIGLPStrategy(
871bdd1243dSDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
872*5f757f3fSDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
873*5f757f3fSDimitry Andric       bool IsReentry) override;
874bdd1243dSDimitry Andric 
875bdd1243dSDimitry Andric   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
876bdd1243dSDimitry Andric 
877bdd1243dSDimitry Andric   MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
87806c3fb27SDimitry Andric       : IGLPStrategy(DAG, TII) {
87906c3fb27SDimitry Andric     IsBottomUp = 1;
88006c3fb27SDimitry Andric   }
881bdd1243dSDimitry Andric };
882bdd1243dSDimitry Andric 
883bdd1243dSDimitry Andric void MFMASmallGemmOpt::applyIGLPStrategy(
884bdd1243dSDimitry Andric     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
885*5f757f3fSDimitry Andric     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
886*5f757f3fSDimitry Andric     bool IsReentry) {
887bdd1243dSDimitry Andric   // Count the number of MFMA instructions.
888bdd1243dSDimitry Andric   unsigned MFMACount = 0;
889bdd1243dSDimitry Andric   for (const MachineInstr &I : *DAG)
890bdd1243dSDimitry Andric     if (TII->isMFMAorWMMA(I))
891bdd1243dSDimitry Andric       ++MFMACount;
892bdd1243dSDimitry Andric 
893bdd1243dSDimitry Andric   const unsigned PipelineSyncID = 0;
894bdd1243dSDimitry Andric   SchedGroup *SG = nullptr;
895bdd1243dSDimitry Andric   for (unsigned I = 0; I < MFMACount * 3; ++I) {
896bdd1243dSDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
897bdd1243dSDimitry Andric         SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
898bdd1243dSDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
899bdd1243dSDimitry Andric 
900bdd1243dSDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
901bdd1243dSDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
902bdd1243dSDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
903bdd1243dSDimitry Andric   }
904bdd1243dSDimitry Andric }
905bdd1243dSDimitry Andric 
90606c3fb27SDimitry Andric class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
90706c3fb27SDimitry Andric private:
90806c3fb27SDimitry Andric   // Whether the DS_READ is a predecessor of first four MFMA in region
90906c3fb27SDimitry Andric   class EnablesInitialMFMA final : public InstructionRule {
91006c3fb27SDimitry Andric   public:
91106c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
91206c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
91306c3fb27SDimitry Andric       if (!SyncPipe.size())
91406c3fb27SDimitry Andric         return false;
91506c3fb27SDimitry Andric       int MFMAsFound = 0;
91606c3fb27SDimitry Andric       if (!Cache->size()) {
91706c3fb27SDimitry Andric         for (auto &Elt : SyncPipe[0].DAG->SUnits) {
91806c3fb27SDimitry Andric           if (TII->isMFMAorWMMA(*Elt.getInstr())) {
91906c3fb27SDimitry Andric             ++MFMAsFound;
92006c3fb27SDimitry Andric             if (MFMAsFound > 4)
92106c3fb27SDimitry Andric               break;
92206c3fb27SDimitry Andric             Cache->push_back(&Elt);
92306c3fb27SDimitry Andric           }
92406c3fb27SDimitry Andric         }
92506c3fb27SDimitry Andric       }
92606c3fb27SDimitry Andric 
92706c3fb27SDimitry Andric       assert(Cache->size());
92806c3fb27SDimitry Andric       auto DAG = SyncPipe[0].DAG;
92906c3fb27SDimitry Andric       for (auto &Elt : *Cache) {
93006c3fb27SDimitry Andric         if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
93106c3fb27SDimitry Andric           return true;
93206c3fb27SDimitry Andric       }
93306c3fb27SDimitry Andric       return false;
93406c3fb27SDimitry Andric     }
93506c3fb27SDimitry Andric 
93606c3fb27SDimitry Andric     EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
93706c3fb27SDimitry Andric                        bool NeedsCache = false)
93806c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
93906c3fb27SDimitry Andric   };
94006c3fb27SDimitry Andric 
94106c3fb27SDimitry Andric   // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
94206c3fb27SDimitry Andric   class IsPermForDSW final : public InstructionRule {
94306c3fb27SDimitry Andric   public:
94406c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
94506c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
94606c3fb27SDimitry Andric       auto MI = SU->getInstr();
94706c3fb27SDimitry Andric       if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
94806c3fb27SDimitry Andric         return false;
94906c3fb27SDimitry Andric 
95006c3fb27SDimitry Andric       bool FitsInGroup = false;
95106c3fb27SDimitry Andric       // Does the VALU have a DS_WRITE successor
95206c3fb27SDimitry Andric       if (!Collection.size()) {
95306c3fb27SDimitry Andric         for (auto &Succ : SU->Succs) {
95406c3fb27SDimitry Andric           SUnit *SuccUnit = Succ.getSUnit();
95506c3fb27SDimitry Andric           if (TII->isDS(*SuccUnit->getInstr()) &&
95606c3fb27SDimitry Andric               SuccUnit->getInstr()->mayStore()) {
95706c3fb27SDimitry Andric             Cache->push_back(SuccUnit);
95806c3fb27SDimitry Andric             FitsInGroup = true;
95906c3fb27SDimitry Andric           }
96006c3fb27SDimitry Andric         }
96106c3fb27SDimitry Andric         return FitsInGroup;
96206c3fb27SDimitry Andric       }
96306c3fb27SDimitry Andric 
96406c3fb27SDimitry Andric       assert(Cache->size());
96506c3fb27SDimitry Andric 
96606c3fb27SDimitry Andric       // Does the VALU have a DS_WRITE successor that is the same as other
96706c3fb27SDimitry Andric       // VALU already in the group. The V_PERMs will all share 1 DS_W succ
968*5f757f3fSDimitry Andric       return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
969*5f757f3fSDimitry Andric         return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
97006c3fb27SDimitry Andric           return ThisSucc.getSUnit() == Elt;
97106c3fb27SDimitry Andric         });
97206c3fb27SDimitry Andric       });
97306c3fb27SDimitry Andric     }
97406c3fb27SDimitry Andric 
97506c3fb27SDimitry Andric     IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
97606c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
97706c3fb27SDimitry Andric   };
97806c3fb27SDimitry Andric 
97906c3fb27SDimitry Andric   // Whether the SU is a successor of any element in previous SchedGroup
98006c3fb27SDimitry Andric   class IsSuccOfPrevGroup final : public InstructionRule {
98106c3fb27SDimitry Andric   public:
98206c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
98306c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
98406c3fb27SDimitry Andric       SchedGroup *OtherGroup = nullptr;
98506c3fb27SDimitry Andric       for (auto &PipeSG : SyncPipe) {
98606c3fb27SDimitry Andric         if ((unsigned)PipeSG.getSGID() == SGID - 1) {
98706c3fb27SDimitry Andric           OtherGroup = &PipeSG;
98806c3fb27SDimitry Andric         }
98906c3fb27SDimitry Andric       }
99006c3fb27SDimitry Andric 
99106c3fb27SDimitry Andric       if (!OtherGroup)
99206c3fb27SDimitry Andric         return false;
99306c3fb27SDimitry Andric       if (!OtherGroup->Collection.size())
99406c3fb27SDimitry Andric         return true;
99506c3fb27SDimitry Andric 
99606c3fb27SDimitry Andric       // Does the previous VALU have this DS_Write as a successor
99706c3fb27SDimitry Andric       return (std::any_of(OtherGroup->Collection.begin(),
99806c3fb27SDimitry Andric                           OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
99906c3fb27SDimitry Andric                             return std::any_of(Elt->Succs.begin(),
100006c3fb27SDimitry Andric                                                Elt->Succs.end(),
100106c3fb27SDimitry Andric                                                [&SU](SDep &Succ) {
100206c3fb27SDimitry Andric                                                  return Succ.getSUnit() == SU;
100306c3fb27SDimitry Andric                                                });
100406c3fb27SDimitry Andric                           }));
100506c3fb27SDimitry Andric     }
100606c3fb27SDimitry Andric     IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
100706c3fb27SDimitry Andric                       bool NeedsCache = false)
100806c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
100906c3fb27SDimitry Andric   };
101006c3fb27SDimitry Andric 
101106c3fb27SDimitry Andric   // Whether the combined load width of group is 128 bits
101206c3fb27SDimitry Andric   class VMEMSize final : public InstructionRule {
101306c3fb27SDimitry Andric   public:
101406c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
101506c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
101606c3fb27SDimitry Andric       auto MI = SU->getInstr();
101706c3fb27SDimitry Andric       if (MI->getOpcode() == TargetOpcode::BUNDLE)
101806c3fb27SDimitry Andric         return false;
101906c3fb27SDimitry Andric       if (!Collection.size())
102006c3fb27SDimitry Andric         return true;
102106c3fb27SDimitry Andric 
102206c3fb27SDimitry Andric       int NumBits = 0;
102306c3fb27SDimitry Andric 
102406c3fb27SDimitry Andric       auto TRI = TII->getRegisterInfo();
102506c3fb27SDimitry Andric       auto &MRI = MI->getParent()->getParent()->getRegInfo();
102606c3fb27SDimitry Andric       for (auto &Elt : Collection) {
102706c3fb27SDimitry Andric         auto Op = Elt->getInstr()->getOperand(0);
102806c3fb27SDimitry Andric         auto Size =
102906c3fb27SDimitry Andric             TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
103006c3fb27SDimitry Andric         NumBits += Size;
103106c3fb27SDimitry Andric       }
103206c3fb27SDimitry Andric 
103306c3fb27SDimitry Andric       if (NumBits < 128) {
103406c3fb27SDimitry Andric         assert(TII->isVMEM(*MI) && MI->mayLoad());
103506c3fb27SDimitry Andric         if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
103606c3fb27SDimitry Andric                           MRI, MI->getOperand(0))) <=
103706c3fb27SDimitry Andric             128)
103806c3fb27SDimitry Andric           return true;
103906c3fb27SDimitry Andric       }
104006c3fb27SDimitry Andric 
104106c3fb27SDimitry Andric       return false;
104206c3fb27SDimitry Andric     }
104306c3fb27SDimitry Andric 
104406c3fb27SDimitry Andric     VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
104506c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache) {}
104606c3fb27SDimitry Andric   };
104706c3fb27SDimitry Andric 
1048*5f757f3fSDimitry Andric   /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
1049*5f757f3fSDimitry Andric   /// that is \p Distance steps away
105006c3fb27SDimitry Andric   class SharesPredWithPrevNthGroup final : public InstructionRule {
105106c3fb27SDimitry Andric   private:
105206c3fb27SDimitry Andric     unsigned Distance = 1;
105306c3fb27SDimitry Andric 
105406c3fb27SDimitry Andric   public:
105506c3fb27SDimitry Andric     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
105606c3fb27SDimitry Andric                SmallVectorImpl<SchedGroup> &SyncPipe) override {
105706c3fb27SDimitry Andric       SchedGroup *OtherGroup = nullptr;
105806c3fb27SDimitry Andric       if (!SyncPipe.size())
105906c3fb27SDimitry Andric         return false;
106006c3fb27SDimitry Andric 
106106c3fb27SDimitry Andric       if (!Cache->size()) {
106206c3fb27SDimitry Andric 
106306c3fb27SDimitry Andric         for (auto &PipeSG : SyncPipe) {
106406c3fb27SDimitry Andric           if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
106506c3fb27SDimitry Andric             OtherGroup = &PipeSG;
106606c3fb27SDimitry Andric           }
106706c3fb27SDimitry Andric         }
106806c3fb27SDimitry Andric 
106906c3fb27SDimitry Andric         if (!OtherGroup)
107006c3fb27SDimitry Andric           return false;
107106c3fb27SDimitry Andric         if (!OtherGroup->Collection.size())
107206c3fb27SDimitry Andric           return true;
107306c3fb27SDimitry Andric 
107406c3fb27SDimitry Andric         for (auto &OtherEle : OtherGroup->Collection) {
107506c3fb27SDimitry Andric           for (auto &Pred : OtherEle->Preds) {
107606c3fb27SDimitry Andric             if (Pred.getSUnit()->getInstr()->getOpcode() ==
107706c3fb27SDimitry Andric                 AMDGPU::V_PERM_B32_e64)
107806c3fb27SDimitry Andric               Cache->push_back(Pred.getSUnit());
107906c3fb27SDimitry Andric           }
108006c3fb27SDimitry Andric         }
1081*5f757f3fSDimitry Andric 
1082*5f757f3fSDimitry Andric         // If the other group has no PERM preds, then this group won't share any
1083*5f757f3fSDimitry Andric         if (!Cache->size())
1084*5f757f3fSDimitry Andric           return false;
108506c3fb27SDimitry Andric       }
108606c3fb27SDimitry Andric 
108706c3fb27SDimitry Andric       auto DAG = SyncPipe[0].DAG;
108806c3fb27SDimitry Andric       // Does the previous DS_WRITE share a V_PERM predecessor with this
108906c3fb27SDimitry Andric       // VMEM_READ
1090*5f757f3fSDimitry Andric       return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
109106c3fb27SDimitry Andric         return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
1092*5f757f3fSDimitry Andric       });
109306c3fb27SDimitry Andric     }
109406c3fb27SDimitry Andric     SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
109506c3fb27SDimitry Andric                                unsigned SGID, bool NeedsCache = false)
109606c3fb27SDimitry Andric         : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
109706c3fb27SDimitry Andric   };
109806c3fb27SDimitry Andric 
109906c3fb27SDimitry Andric public:
110006c3fb27SDimitry Andric   void applyIGLPStrategy(
110106c3fb27SDimitry Andric       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1102*5f757f3fSDimitry Andric       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1103*5f757f3fSDimitry Andric       bool IsReentry) override;
110406c3fb27SDimitry Andric 
110506c3fb27SDimitry Andric   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
110606c3fb27SDimitry Andric 
110706c3fb27SDimitry Andric   MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
110806c3fb27SDimitry Andric       : IGLPStrategy(DAG, TII) {
110906c3fb27SDimitry Andric     IsBottomUp = 0;
111006c3fb27SDimitry Andric   }
111106c3fb27SDimitry Andric };
111206c3fb27SDimitry Andric 
1113*5f757f3fSDimitry Andric static unsigned DSWCount = 0;
1114*5f757f3fSDimitry Andric static unsigned DSWWithPermCount = 0;
1115*5f757f3fSDimitry Andric static unsigned DSWWithSharedVMEMCount = 0;
1116*5f757f3fSDimitry Andric 
111706c3fb27SDimitry Andric void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
111806c3fb27SDimitry Andric     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1119*5f757f3fSDimitry Andric     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1120*5f757f3fSDimitry Andric     bool IsReentry) {
112106c3fb27SDimitry Andric   unsigned MFMACount = 0;
112206c3fb27SDimitry Andric   unsigned DSRCount = 0;
1123*5f757f3fSDimitry Andric 
1124*5f757f3fSDimitry Andric   assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 &&
1125*5f757f3fSDimitry Andric                         DSWWithSharedVMEMCount == 0)) &&
1126*5f757f3fSDimitry Andric          "DSWCounters should be zero in pre-RA scheduling!");
112706c3fb27SDimitry Andric   SmallVector<SUnit *, 6> DSWithPerms;
112806c3fb27SDimitry Andric   for (auto &SU : DAG->SUnits) {
112906c3fb27SDimitry Andric     auto I = SU.getInstr();
113006c3fb27SDimitry Andric     if (TII->isMFMAorWMMA(*I))
113106c3fb27SDimitry Andric       ++MFMACount;
113206c3fb27SDimitry Andric     else if (TII->isDS(*I)) {
113306c3fb27SDimitry Andric       if (I->mayLoad())
113406c3fb27SDimitry Andric         ++DSRCount;
1135*5f757f3fSDimitry Andric       else if (I->mayStore() && !IsReentry) {
113606c3fb27SDimitry Andric         ++DSWCount;
113706c3fb27SDimitry Andric         for (auto Pred : SU.Preds) {
113806c3fb27SDimitry Andric           if (Pred.getSUnit()->getInstr()->getOpcode() ==
113906c3fb27SDimitry Andric               AMDGPU::V_PERM_B32_e64) {
114006c3fb27SDimitry Andric             DSWithPerms.push_back(&SU);
114106c3fb27SDimitry Andric             break;
114206c3fb27SDimitry Andric           }
114306c3fb27SDimitry Andric         }
114406c3fb27SDimitry Andric       }
114506c3fb27SDimitry Andric     }
114606c3fb27SDimitry Andric   }
1147*5f757f3fSDimitry Andric 
1148*5f757f3fSDimitry Andric   if (!IsReentry) {
114906c3fb27SDimitry Andric     DSWWithPermCount = DSWithPerms.size();
115006c3fb27SDimitry Andric     auto I = DSWithPerms.begin();
115106c3fb27SDimitry Andric     auto E = DSWithPerms.end();
115206c3fb27SDimitry Andric 
115306c3fb27SDimitry Andric     // Get the count of DS_WRITES with V_PERM predecessors which
115406c3fb27SDimitry Andric     // have loop carried dependencies (WAR) on the same VMEM_READs.
115506c3fb27SDimitry Andric     // We consider partial overlap as a miss -- in other words,
115606c3fb27SDimitry Andric     // for a given DS_W, we only consider another DS_W as matching
115706c3fb27SDimitry Andric     // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
115806c3fb27SDimitry Andric     // for every V_PERM pred of this DS_W.
115906c3fb27SDimitry Andric     DenseMap<MachineInstr *, SUnit *> VMEMLookup;
116006c3fb27SDimitry Andric     SmallVector<SUnit *, 6> Counted;
116106c3fb27SDimitry Andric     for (; I != E; I++) {
116206c3fb27SDimitry Andric       SUnit *Cand = nullptr;
116306c3fb27SDimitry Andric       bool MissedAny = false;
116406c3fb27SDimitry Andric       for (auto &Pred : (*I)->Preds) {
116506c3fb27SDimitry Andric         if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
116606c3fb27SDimitry Andric           continue;
116706c3fb27SDimitry Andric 
1168*5f757f3fSDimitry Andric         if (Cand && llvm::is_contained(Counted, Cand))
116906c3fb27SDimitry Andric           break;
117006c3fb27SDimitry Andric 
117106c3fb27SDimitry Andric         for (auto &Succ : Pred.getSUnit()->Succs) {
117206c3fb27SDimitry Andric           auto MI = Succ.getSUnit()->getInstr();
117306c3fb27SDimitry Andric           if (!TII->isVMEM(*MI) || !MI->mayLoad())
117406c3fb27SDimitry Andric             continue;
117506c3fb27SDimitry Andric 
117606c3fb27SDimitry Andric           if (MissedAny || !VMEMLookup.size()) {
117706c3fb27SDimitry Andric             MissedAny = true;
117806c3fb27SDimitry Andric             VMEMLookup[MI] = *I;
117906c3fb27SDimitry Andric             continue;
118006c3fb27SDimitry Andric           }
118106c3fb27SDimitry Andric 
118206c3fb27SDimitry Andric           if (!VMEMLookup.contains(MI)) {
118306c3fb27SDimitry Andric             MissedAny = true;
118406c3fb27SDimitry Andric             VMEMLookup[MI] = *I;
118506c3fb27SDimitry Andric             continue;
118606c3fb27SDimitry Andric           }
118706c3fb27SDimitry Andric 
118806c3fb27SDimitry Andric           Cand = VMEMLookup[MI];
1189*5f757f3fSDimitry Andric           if (llvm::is_contained(Counted, Cand)) {
119006c3fb27SDimitry Andric             MissedAny = true;
119106c3fb27SDimitry Andric             break;
119206c3fb27SDimitry Andric           }
119306c3fb27SDimitry Andric         }
119406c3fb27SDimitry Andric       }
119506c3fb27SDimitry Andric       if (!MissedAny && Cand) {
119606c3fb27SDimitry Andric         DSWWithSharedVMEMCount += 2;
119706c3fb27SDimitry Andric         Counted.push_back(Cand);
119806c3fb27SDimitry Andric         Counted.push_back(*I);
119906c3fb27SDimitry Andric       }
120006c3fb27SDimitry Andric     }
1201*5f757f3fSDimitry Andric   }
120206c3fb27SDimitry Andric 
120306c3fb27SDimitry Andric   assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
120406c3fb27SDimitry Andric   SchedGroup *SG;
120506c3fb27SDimitry Andric   unsigned PipelineSyncID = 0;
120606c3fb27SDimitry Andric   // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
120706c3fb27SDimitry Andric   if (DSWWithPermCount) {
120806c3fb27SDimitry Andric     for (unsigned I = 0; I < MFMACount; I++) {
120906c3fb27SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
121006c3fb27SDimitry Andric           SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
121106c3fb27SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
121206c3fb27SDimitry Andric 
121306c3fb27SDimitry Andric       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
121406c3fb27SDimitry Andric           SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
121506c3fb27SDimitry Andric       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
121606c3fb27SDimitry Andric     }
121706c3fb27SDimitry Andric   }
121806c3fb27SDimitry Andric 
121906c3fb27SDimitry Andric   PipelineSyncID = 1;
122006c3fb27SDimitry Andric   // Phase 1: Break up DS_READ and MFMA clusters.
122106c3fb27SDimitry Andric   // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
122206c3fb27SDimitry Andric   // prefetch
122306c3fb27SDimitry Andric 
122406c3fb27SDimitry Andric   // Make ready initial MFMA
122506c3fb27SDimitry Andric   SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
122606c3fb27SDimitry Andric       SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
122706c3fb27SDimitry Andric   SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
122806c3fb27SDimitry Andric   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
122906c3fb27SDimitry Andric 
123006c3fb27SDimitry Andric   SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
123106c3fb27SDimitry Andric       SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
123206c3fb27SDimitry Andric   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
123306c3fb27SDimitry Andric 
123406c3fb27SDimitry Andric   // Interleave MFMA with DS_READ prefetch
123506c3fb27SDimitry Andric   for (unsigned I = 0; I < DSRCount - 4; ++I) {
123606c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
123706c3fb27SDimitry Andric         SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
123806c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
123906c3fb27SDimitry Andric 
124006c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
124106c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
124206c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
124306c3fb27SDimitry Andric   }
124406c3fb27SDimitry Andric 
124506c3fb27SDimitry Andric   // Phase 2a: Loop carried dependency with V_PERM
124606c3fb27SDimitry Andric   // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
124706c3fb27SDimitry Andric   // depend on. Interleave MFMA to keep XDL unit busy throughout.
124806c3fb27SDimitry Andric   for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
124906c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
125006c3fb27SDimitry Andric         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
125106c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
125206c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
125306c3fb27SDimitry Andric 
125406c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
125506c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
125606c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
125706c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
125806c3fb27SDimitry Andric 
125906c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
126006c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
126106c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
126206c3fb27SDimitry Andric         1, TII, SG->getSGID(), true));
126306c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
126406c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
126506c3fb27SDimitry Andric 
126606c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
126706c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
126806c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
126906c3fb27SDimitry Andric 
127006c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
127106c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
127206c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
127306c3fb27SDimitry Andric         3, TII, SG->getSGID(), true));
127406c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
127506c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
127606c3fb27SDimitry Andric 
127706c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
127806c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
127906c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
128006c3fb27SDimitry Andric   }
128106c3fb27SDimitry Andric 
128206c3fb27SDimitry Andric   // Phase 2b: Loop carried dependency without V_PERM
128306c3fb27SDimitry Andric   // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
128406c3fb27SDimitry Andric   // Interleave MFMA to keep XDL unit busy throughout.
128506c3fb27SDimitry Andric   for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
128606c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
128706c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
128806c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
128906c3fb27SDimitry Andric 
129006c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
129106c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
129206c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
129306c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
129406c3fb27SDimitry Andric 
129506c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
129606c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
129706c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
129806c3fb27SDimitry Andric   }
129906c3fb27SDimitry Andric 
130006c3fb27SDimitry Andric   // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
130106c3fb27SDimitry Andric   // ultimately used by two DS_WRITE
130206c3fb27SDimitry Andric   // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
130306c3fb27SDimitry Andric   // depend on. Interleave MFMA to keep XDL unit busy throughout.
130406c3fb27SDimitry Andric 
130506c3fb27SDimitry Andric   for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
130606c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
130706c3fb27SDimitry Andric         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
130806c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
130906c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
131006c3fb27SDimitry Andric 
131106c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
131206c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
131306c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
131406c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
131506c3fb27SDimitry Andric 
131606c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
131706c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
131806c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
131906c3fb27SDimitry Andric 
132006c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
132106c3fb27SDimitry Andric         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
132206c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
132306c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
132406c3fb27SDimitry Andric 
132506c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
132606c3fb27SDimitry Andric         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
132706c3fb27SDimitry Andric     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
132806c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
132906c3fb27SDimitry Andric 
133006c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
133106c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
133206c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
133306c3fb27SDimitry Andric 
133406c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
133506c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
133606c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
133706c3fb27SDimitry Andric         2, TII, SG->getSGID(), true));
133806c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
133906c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
134006c3fb27SDimitry Andric 
134106c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
134206c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
134306c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
134406c3fb27SDimitry Andric 
134506c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
134606c3fb27SDimitry Andric         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
134706c3fb27SDimitry Andric     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
134806c3fb27SDimitry Andric         4, TII, SG->getSGID(), true));
134906c3fb27SDimitry Andric     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
135006c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
135106c3fb27SDimitry Andric 
135206c3fb27SDimitry Andric     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
135306c3fb27SDimitry Andric         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
135406c3fb27SDimitry Andric     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
135506c3fb27SDimitry Andric   }
135606c3fb27SDimitry Andric }
135706c3fb27SDimitry Andric 
1358bdd1243dSDimitry Andric static std::unique_ptr<IGLPStrategy>
1359bdd1243dSDimitry Andric createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
1360bdd1243dSDimitry Andric                    const SIInstrInfo *TII) {
1361bdd1243dSDimitry Andric   switch (ID) {
1362bdd1243dSDimitry Andric   case MFMASmallGemmOptID:
1363bdd1243dSDimitry Andric     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
136406c3fb27SDimitry Andric   case MFMASmallGemmSingleWaveOptID:
136506c3fb27SDimitry Andric     return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
1366bdd1243dSDimitry Andric   }
1367bdd1243dSDimitry Andric 
1368bdd1243dSDimitry Andric   llvm_unreachable("Unknown IGLPStrategyID");
1369bdd1243dSDimitry Andric }
1370bdd1243dSDimitry Andric 
1371bdd1243dSDimitry Andric class IGroupLPDAGMutation : public ScheduleDAGMutation {
1372bdd1243dSDimitry Andric private:
1373bdd1243dSDimitry Andric   const SIInstrInfo *TII;
1374bdd1243dSDimitry Andric 
1375bdd1243dSDimitry Andric   ScheduleDAGMI *DAG;
1376bdd1243dSDimitry Andric 
1377bdd1243dSDimitry Andric   // Organize lists of SchedGroups by their SyncID. SchedGroups /
1378bdd1243dSDimitry Andric   // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
1379bdd1243dSDimitry Andric   // between then.
1380bdd1243dSDimitry Andric   DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
1381bdd1243dSDimitry Andric 
1382bdd1243dSDimitry Andric   // Used to track instructions that can be mapped to multiple sched groups
1383bdd1243dSDimitry Andric   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
1384bdd1243dSDimitry Andric 
1385bdd1243dSDimitry Andric   // Add DAG edges that enforce SCHED_BARRIER ordering.
1386bdd1243dSDimitry Andric   void addSchedBarrierEdges(SUnit &SU);
1387bdd1243dSDimitry Andric 
1388bdd1243dSDimitry Andric   // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
1389bdd1243dSDimitry Andric   // not be reordered accross the SCHED_BARRIER. This is used for the base
1390bdd1243dSDimitry Andric   // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
1391bdd1243dSDimitry Andric   // SCHED_BARRIER will always block all instructions that can be classified
1392bdd1243dSDimitry Andric   // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
1393bdd1243dSDimitry Andric   // and may only synchronize with some SchedGroups. Returns the inverse of
1394bdd1243dSDimitry Andric   // Mask. SCHED_BARRIER's mask describes which instruction types should be
1395bdd1243dSDimitry Andric   // allowed to be scheduled across it. Invert the mask to get the
1396bdd1243dSDimitry Andric   // SchedGroupMask of instructions that should be barred.
1397bdd1243dSDimitry Andric   SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
1398bdd1243dSDimitry Andric 
1399bdd1243dSDimitry Andric   // Create SchedGroups for a SCHED_GROUP_BARRIER.
1400bdd1243dSDimitry Andric   void initSchedGroupBarrierPipelineStage(
1401bdd1243dSDimitry Andric       std::vector<SUnit>::reverse_iterator RIter);
1402bdd1243dSDimitry Andric 
1403bdd1243dSDimitry Andric   void initIGLPOpt(SUnit &SU);
1404bdd1243dSDimitry Andric 
1405bdd1243dSDimitry Andric public:
1406bdd1243dSDimitry Andric   void apply(ScheduleDAGInstrs *DAGInstrs) override;
1407bdd1243dSDimitry Andric 
140806c3fb27SDimitry Andric   // The order in which the PipelineSolver should process the candidate
140906c3fb27SDimitry Andric   // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
141006c3fb27SDimitry Andric   // created SchedGroup first, and will consider that as the ultimate
141106c3fb27SDimitry Andric   // predecessor group when linking. TOP_DOWN instead links and processes the
141206c3fb27SDimitry Andric   // first created SchedGroup first.
141306c3fb27SDimitry Andric   bool IsBottomUp = 1;
141406c3fb27SDimitry Andric 
1415*5f757f3fSDimitry Andric   // Whether or not this is a reentry into the IGroupLPDAGMutation.
1416*5f757f3fSDimitry Andric   bool IsReentry = false;
1417*5f757f3fSDimitry Andric 
1418bdd1243dSDimitry Andric   IGroupLPDAGMutation() = default;
1419*5f757f3fSDimitry Andric   IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {}
1420bdd1243dSDimitry Andric };
1421bdd1243dSDimitry Andric 
1422bdd1243dSDimitry Andric unsigned SchedGroup::NumSchedGroups = 0;
1423bdd1243dSDimitry Andric 
1424bdd1243dSDimitry Andric bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
1425bdd1243dSDimitry Andric   if (A != B && DAG->canAddEdge(B, A)) {
1426bdd1243dSDimitry Andric     DAG->addEdge(B, SDep(A, SDep::Artificial));
1427bdd1243dSDimitry Andric     return true;
1428bdd1243dSDimitry Andric   }
1429bdd1243dSDimitry Andric   return false;
1430bdd1243dSDimitry Andric }
1431bdd1243dSDimitry Andric 
1432bdd1243dSDimitry Andric bool SchedGroup::canAddMI(const MachineInstr &MI) const {
1433bdd1243dSDimitry Andric   bool Result = false;
1434bdd1243dSDimitry Andric   if (MI.isMetaInstruction())
1435bdd1243dSDimitry Andric     Result = false;
1436bdd1243dSDimitry Andric 
1437bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
1438bdd1243dSDimitry Andric            (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI)))
1439bdd1243dSDimitry Andric     Result = true;
1440bdd1243dSDimitry Andric 
1441bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
1442bdd1243dSDimitry Andric            TII->isVALU(MI) && !TII->isMFMAorWMMA(MI))
1443bdd1243dSDimitry Andric     Result = true;
1444bdd1243dSDimitry Andric 
1445bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
1446bdd1243dSDimitry Andric            TII->isSALU(MI))
1447bdd1243dSDimitry Andric     Result = true;
1448bdd1243dSDimitry Andric 
1449bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
1450bdd1243dSDimitry Andric            TII->isMFMAorWMMA(MI))
1451bdd1243dSDimitry Andric     Result = true;
1452bdd1243dSDimitry Andric 
1453bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
1454bdd1243dSDimitry Andric            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
1455bdd1243dSDimitry Andric     Result = true;
1456bdd1243dSDimitry Andric 
1457bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
1458bdd1243dSDimitry Andric            MI.mayLoad() &&
1459bdd1243dSDimitry Andric            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
1460bdd1243dSDimitry Andric     Result = true;
1461bdd1243dSDimitry Andric 
1462bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
1463bdd1243dSDimitry Andric            MI.mayStore() &&
1464bdd1243dSDimitry Andric            (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
1465bdd1243dSDimitry Andric     Result = true;
1466bdd1243dSDimitry Andric 
1467bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
1468bdd1243dSDimitry Andric            TII->isDS(MI))
1469bdd1243dSDimitry Andric     Result = true;
1470bdd1243dSDimitry Andric 
1471bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
1472bdd1243dSDimitry Andric            MI.mayLoad() && TII->isDS(MI))
1473bdd1243dSDimitry Andric     Result = true;
1474bdd1243dSDimitry Andric 
1475bdd1243dSDimitry Andric   else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
1476bdd1243dSDimitry Andric            MI.mayStore() && TII->isDS(MI))
1477bdd1243dSDimitry Andric     Result = true;
1478bdd1243dSDimitry Andric 
1479bdd1243dSDimitry Andric   LLVM_DEBUG(
1480bdd1243dSDimitry Andric       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
1481bdd1243dSDimitry Andric              << (Result ? " could classify " : " unable to classify ") << MI);
1482bdd1243dSDimitry Andric 
1483bdd1243dSDimitry Andric   return Result;
1484bdd1243dSDimitry Andric }
1485bdd1243dSDimitry Andric 
1486bdd1243dSDimitry Andric int SchedGroup::link(SUnit &SU, bool MakePred,
1487bdd1243dSDimitry Andric                      std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
1488bdd1243dSDimitry Andric   int MissedEdges = 0;
1489bdd1243dSDimitry Andric   for (auto *A : Collection) {
1490bdd1243dSDimitry Andric     SUnit *B = &SU;
1491bdd1243dSDimitry Andric     if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
1492bdd1243dSDimitry Andric       continue;
1493bdd1243dSDimitry Andric     if (MakePred)
1494bdd1243dSDimitry Andric       std::swap(A, B);
1495bdd1243dSDimitry Andric 
1496bdd1243dSDimitry Andric     if (DAG->IsReachable(B, A))
1497bdd1243dSDimitry Andric       continue;
149806c3fb27SDimitry Andric 
1499bdd1243dSDimitry Andric     // tryAddEdge returns false if there is a dependency that makes adding
1500bdd1243dSDimitry Andric     // the A->B edge impossible, otherwise it returns true;
1501bdd1243dSDimitry Andric     bool Added = tryAddEdge(A, B);
1502bdd1243dSDimitry Andric     if (Added)
1503bdd1243dSDimitry Andric       AddedEdges.push_back(std::pair(A, B));
1504bdd1243dSDimitry Andric     else
1505bdd1243dSDimitry Andric       ++MissedEdges;
1506bdd1243dSDimitry Andric   }
1507bdd1243dSDimitry Andric 
1508bdd1243dSDimitry Andric   return MissedEdges;
1509bdd1243dSDimitry Andric }
1510bdd1243dSDimitry Andric 
1511bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU, bool MakePred) {
1512bdd1243dSDimitry Andric   for (auto *A : Collection) {
1513bdd1243dSDimitry Andric     SUnit *B = &SU;
1514bdd1243dSDimitry Andric     if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
1515bdd1243dSDimitry Andric       continue;
1516bdd1243dSDimitry Andric     if (MakePred)
1517bdd1243dSDimitry Andric       std::swap(A, B);
1518bdd1243dSDimitry Andric 
1519bdd1243dSDimitry Andric     tryAddEdge(A, B);
1520bdd1243dSDimitry Andric   }
1521bdd1243dSDimitry Andric }
1522bdd1243dSDimitry Andric 
1523bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU,
1524bdd1243dSDimitry Andric                       function_ref<bool(const SUnit *A, const SUnit *B)> P) {
1525bdd1243dSDimitry Andric   for (auto *A : Collection) {
1526bdd1243dSDimitry Andric     SUnit *B = &SU;
1527bdd1243dSDimitry Andric     if (P(A, B))
1528bdd1243dSDimitry Andric       std::swap(A, B);
1529bdd1243dSDimitry Andric 
1530bdd1243dSDimitry Andric     tryAddEdge(A, B);
1531bdd1243dSDimitry Andric   }
1532bdd1243dSDimitry Andric }
1533bdd1243dSDimitry Andric 
1534bdd1243dSDimitry Andric void SchedGroup::link(SchedGroup &OtherGroup) {
1535bdd1243dSDimitry Andric   for (auto *B : OtherGroup.Collection)
1536bdd1243dSDimitry Andric     link(*B);
1537bdd1243dSDimitry Andric }
1538bdd1243dSDimitry Andric 
1539bdd1243dSDimitry Andric bool SchedGroup::canAddSU(SUnit &SU) const {
1540bdd1243dSDimitry Andric   MachineInstr &MI = *SU.getInstr();
1541bdd1243dSDimitry Andric   if (MI.getOpcode() != TargetOpcode::BUNDLE)
1542bdd1243dSDimitry Andric     return canAddMI(MI);
1543bdd1243dSDimitry Andric 
1544bdd1243dSDimitry Andric   // Special case for bundled MIs.
1545bdd1243dSDimitry Andric   const MachineBasicBlock *MBB = MI.getParent();
1546bdd1243dSDimitry Andric   MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
1547bdd1243dSDimitry Andric   while (E != MBB->end() && E->isBundledWithPred())
1548bdd1243dSDimitry Andric     ++E;
1549bdd1243dSDimitry Andric 
1550bdd1243dSDimitry Andric   // Return true if all of the bundled MIs can be added to this group.
1551bdd1243dSDimitry Andric   return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
1552bdd1243dSDimitry Andric }
1553bdd1243dSDimitry Andric 
1554bdd1243dSDimitry Andric void SchedGroup::initSchedGroup() {
1555bdd1243dSDimitry Andric   for (auto &SU : DAG->SUnits) {
1556bdd1243dSDimitry Andric     if (isFull())
1557bdd1243dSDimitry Andric       break;
1558bdd1243dSDimitry Andric 
1559bdd1243dSDimitry Andric     if (canAddSU(SU))
1560bdd1243dSDimitry Andric       add(SU);
1561bdd1243dSDimitry Andric   }
1562bdd1243dSDimitry Andric }
1563bdd1243dSDimitry Andric 
1564bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
1565bdd1243dSDimitry Andric                                 SUnitsToCandidateSGsMap &SyncedInstrs) {
1566bdd1243dSDimitry Andric   SUnit &InitSU = *RIter;
1567bdd1243dSDimitry Andric   for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
1568bdd1243dSDimitry Andric     auto &SU = *RIter;
1569bdd1243dSDimitry Andric     if (isFull())
1570bdd1243dSDimitry Andric       break;
1571bdd1243dSDimitry Andric 
1572bdd1243dSDimitry Andric     if (canAddSU(SU))
1573bdd1243dSDimitry Andric       SyncedInstrs[&SU].push_back(SGID);
1574bdd1243dSDimitry Andric   }
1575bdd1243dSDimitry Andric 
1576bdd1243dSDimitry Andric   add(InitSU);
1577bdd1243dSDimitry Andric   assert(MaxSize);
1578bdd1243dSDimitry Andric   (*MaxSize)++;
1579bdd1243dSDimitry Andric }
1580bdd1243dSDimitry Andric 
1581bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
1582bdd1243dSDimitry Andric   auto I = DAG->SUnits.rbegin();
1583bdd1243dSDimitry Andric   auto E = DAG->SUnits.rend();
1584bdd1243dSDimitry Andric   for (; I != E; ++I) {
1585bdd1243dSDimitry Andric     auto &SU = *I;
1586bdd1243dSDimitry Andric     if (isFull())
1587bdd1243dSDimitry Andric       break;
1588bdd1243dSDimitry Andric 
1589bdd1243dSDimitry Andric     if (canAddSU(SU))
1590bdd1243dSDimitry Andric       SyncedInstrs[&SU].push_back(SGID);
1591bdd1243dSDimitry Andric   }
1592bdd1243dSDimitry Andric }
1593bdd1243dSDimitry Andric 
1594bdd1243dSDimitry Andric void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
159581ad6265SDimitry Andric   const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
159681ad6265SDimitry Andric   if (!TSchedModel || DAGInstrs->SUnits.empty())
159781ad6265SDimitry Andric     return;
159881ad6265SDimitry Andric 
1599bdd1243dSDimitry Andric   LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
160081ad6265SDimitry Andric   const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
160181ad6265SDimitry Andric   TII = ST.getInstrInfo();
160281ad6265SDimitry Andric   DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
1603bdd1243dSDimitry Andric   SyncedSchedGroups.clear();
1604bdd1243dSDimitry Andric   SyncedInstrs.clear();
1605bdd1243dSDimitry Andric   bool foundSB = false;
1606bdd1243dSDimitry Andric   bool foundIGLP = false;
1607bdd1243dSDimitry Andric   for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
1608bdd1243dSDimitry Andric     unsigned Opc = R->getInstr()->getOpcode();
1609bdd1243dSDimitry Andric     // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
1610bdd1243dSDimitry Andric     if (Opc == AMDGPU::SCHED_BARRIER) {
1611bdd1243dSDimitry Andric       addSchedBarrierEdges(*R);
1612bdd1243dSDimitry Andric       foundSB = true;
1613bdd1243dSDimitry Andric     } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
1614bdd1243dSDimitry Andric       initSchedGroupBarrierPipelineStage(R);
1615bdd1243dSDimitry Andric       foundSB = true;
1616bdd1243dSDimitry Andric     } else if (Opc == AMDGPU::IGLP_OPT) {
1617bdd1243dSDimitry Andric       resetEdges(*R, DAG);
1618bdd1243dSDimitry Andric       if (!foundSB && !foundIGLP)
1619bdd1243dSDimitry Andric         initIGLPOpt(*R);
1620bdd1243dSDimitry Andric       foundIGLP = true;
1621bdd1243dSDimitry Andric     }
162281ad6265SDimitry Andric   }
162381ad6265SDimitry Andric 
1624bdd1243dSDimitry Andric   if (foundSB || foundIGLP) {
162506c3fb27SDimitry Andric     PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
1626bdd1243dSDimitry Andric     // PipelineSolver performs the mutation by adding the edges it
1627bdd1243dSDimitry Andric     // determined as the best
1628bdd1243dSDimitry Andric     PS.solve();
1629bdd1243dSDimitry Andric   }
1630bdd1243dSDimitry Andric }
1631bdd1243dSDimitry Andric 
1632bdd1243dSDimitry Andric void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
163381ad6265SDimitry Andric   MachineInstr &MI = *SchedBarrier.getInstr();
163481ad6265SDimitry Andric   assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
163581ad6265SDimitry Andric   // Remove all existing edges from the SCHED_BARRIER that were added due to the
163681ad6265SDimitry Andric   // instruction having side effects.
1637bdd1243dSDimitry Andric   resetEdges(SchedBarrier, DAG);
1638bdd1243dSDimitry Andric   auto InvertedMask =
1639bdd1243dSDimitry Andric       invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
1640bdd1243dSDimitry Andric   SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
1641bdd1243dSDimitry Andric   SG.initSchedGroup();
1642bdd1243dSDimitry Andric   // Preserve original instruction ordering relative to the SCHED_BARRIER.
1643bdd1243dSDimitry Andric   SG.link(
1644bdd1243dSDimitry Andric       SchedBarrier,
1645bdd1243dSDimitry Andric       (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
1646bdd1243dSDimitry Andric           const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
164781ad6265SDimitry Andric }
164881ad6265SDimitry Andric 
1649bdd1243dSDimitry Andric SchedGroupMask
1650bdd1243dSDimitry Andric IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
1651bdd1243dSDimitry Andric   // Invert mask and erase bits for types of instructions that are implied to be
1652bdd1243dSDimitry Andric   // allowed past the SCHED_BARRIER.
1653bdd1243dSDimitry Andric   SchedGroupMask InvertedMask = ~Mask;
1654bdd1243dSDimitry Andric 
1655bdd1243dSDimitry Andric   // ALU implies VALU, SALU, MFMA.
1656bdd1243dSDimitry Andric   if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
1657bdd1243dSDimitry Andric     InvertedMask &=
1658bdd1243dSDimitry Andric         ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
1659bdd1243dSDimitry Andric   // VALU, SALU, MFMA implies ALU.
1660bdd1243dSDimitry Andric   else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
1661bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
1662bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
1663bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::ALU;
1664bdd1243dSDimitry Andric 
1665bdd1243dSDimitry Andric   // VMEM implies VMEM_READ, VMEM_WRITE.
1666bdd1243dSDimitry Andric   if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
1667bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
1668bdd1243dSDimitry Andric   // VMEM_READ, VMEM_WRITE implies VMEM.
1669bdd1243dSDimitry Andric   else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
1670bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
1671bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::VMEM;
1672bdd1243dSDimitry Andric 
1673bdd1243dSDimitry Andric   // DS implies DS_READ, DS_WRITE.
1674bdd1243dSDimitry Andric   if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
1675bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
1676bdd1243dSDimitry Andric   // DS_READ, DS_WRITE implies DS.
1677bdd1243dSDimitry Andric   else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
1678bdd1243dSDimitry Andric            (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
1679bdd1243dSDimitry Andric     InvertedMask &= ~SchedGroupMask::DS;
1680bdd1243dSDimitry Andric 
1681bdd1243dSDimitry Andric   return InvertedMask;
168281ad6265SDimitry Andric }
168381ad6265SDimitry Andric 
1684bdd1243dSDimitry Andric void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
1685bdd1243dSDimitry Andric     std::vector<SUnit>::reverse_iterator RIter) {
1686bdd1243dSDimitry Andric   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
1687bdd1243dSDimitry Andric   // to the instruction having side effects.
1688bdd1243dSDimitry Andric   resetEdges(*RIter, DAG);
1689bdd1243dSDimitry Andric   MachineInstr &SGB = *RIter->getInstr();
1690bdd1243dSDimitry Andric   assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
1691bdd1243dSDimitry Andric   int32_t SGMask = SGB.getOperand(0).getImm();
1692bdd1243dSDimitry Andric   int32_t Size = SGB.getOperand(1).getImm();
1693bdd1243dSDimitry Andric   int32_t SyncID = SGB.getOperand(2).getImm();
1694bdd1243dSDimitry Andric 
1695bdd1243dSDimitry Andric   auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
1696bdd1243dSDimitry Andric                                                     Size, SyncID, DAG, TII);
1697bdd1243dSDimitry Andric 
1698bdd1243dSDimitry Andric   SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
169981ad6265SDimitry Andric }
170081ad6265SDimitry Andric 
1701bdd1243dSDimitry Andric void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
1702bdd1243dSDimitry Andric   IGLPStrategyID StrategyID =
1703bdd1243dSDimitry Andric       (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
1704bdd1243dSDimitry Andric   auto S = createIGLPStrategy(StrategyID, DAG, TII);
170506c3fb27SDimitry Andric   if (S->shouldApplyStrategy(DAG)) {
170606c3fb27SDimitry Andric     IsBottomUp = S->IsBottomUp;
1707*5f757f3fSDimitry Andric     S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry);
170881ad6265SDimitry Andric   }
170906c3fb27SDimitry Andric }
171081ad6265SDimitry Andric 
171181ad6265SDimitry Andric } // namespace
171281ad6265SDimitry Andric 
171381ad6265SDimitry Andric namespace llvm {
171481ad6265SDimitry Andric 
1715*5f757f3fSDimitry Andric /// \p IsReentry specifes whether or not this is a reentry into the
1716*5f757f3fSDimitry Andric /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
1717*5f757f3fSDimitry Andric /// same scheduling region (e.g. pre and post-RA scheduling / multiple
1718*5f757f3fSDimitry Andric /// scheduling "phases"), we can reenter this mutation framework more than once
1719*5f757f3fSDimitry Andric /// for a given region.
1720*5f757f3fSDimitry Andric std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) {
1721*5f757f3fSDimitry Andric   return std::make_unique<IGroupLPDAGMutation>(IsReentry);
172281ad6265SDimitry Andric }
172381ad6265SDimitry Andric 
172481ad6265SDimitry Andric } // end namespace llvm
1725