181ad6265SDimitry Andric //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// 281ad6265SDimitry Andric // 381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 681ad6265SDimitry Andric // 781ad6265SDimitry Andric //===----------------------------------------------------------------------===// 881ad6265SDimitry Andric // 981ad6265SDimitry Andric // \file This file defines a set of schedule DAG mutations that can be used to 1081ad6265SDimitry Andric // override default scheduler behavior to enforce specific scheduling patterns. 1181ad6265SDimitry Andric // They should be used in cases where runtime performance considerations such as 1281ad6265SDimitry Andric // inter-wavefront interactions, mean that compile-time heuristics cannot 1381ad6265SDimitry Andric // predict the optimal instruction ordering, or in kernels where optimum 1481ad6265SDimitry Andric // instruction scheduling is important enough to warrant manual intervention. 1581ad6265SDimitry Andric // 1681ad6265SDimitry Andric //===----------------------------------------------------------------------===// 1781ad6265SDimitry Andric 1881ad6265SDimitry Andric #include "AMDGPUIGroupLP.h" 1981ad6265SDimitry Andric #include "AMDGPUTargetMachine.h" 2081ad6265SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 2181ad6265SDimitry Andric #include "SIInstrInfo.h" 2281ad6265SDimitry Andric #include "SIMachineFunctionInfo.h" 2381ad6265SDimitry Andric #include "llvm/ADT/BitmaskEnum.h" 24bdd1243dSDimitry Andric #include "llvm/ADT/DenseMap.h" 2581ad6265SDimitry Andric #include "llvm/CodeGen/MachineScheduler.h" 2681ad6265SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h" 2781ad6265SDimitry Andric 2881ad6265SDimitry Andric using namespace llvm; 2981ad6265SDimitry Andric 30bdd1243dSDimitry Andric #define DEBUG_TYPE "igrouplp" 3181ad6265SDimitry Andric 3281ad6265SDimitry Andric namespace { 3381ad6265SDimitry Andric 34bdd1243dSDimitry Andric static cl::opt<bool> EnableExactSolver( 35bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver", cl::Hidden, 36bdd1243dSDimitry Andric cl::desc("Whether to use the exponential time solver to fit " 37bdd1243dSDimitry Andric "the instructions to the pipeline as closely as " 38bdd1243dSDimitry Andric "possible."), 3981ad6265SDimitry Andric cl::init(false)); 4081ad6265SDimitry Andric 41bdd1243dSDimitry Andric static cl::opt<unsigned> CutoffForExact( 42bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden, 43bdd1243dSDimitry Andric cl::desc("The maximum number of scheduling group conflicts " 44bdd1243dSDimitry Andric "which we attempt to solve with the exponential time " 45bdd1243dSDimitry Andric "exact solver. Problem sizes greater than this will" 46bdd1243dSDimitry Andric "be solved by the less accurate greedy algorithm. Selecting " 47bdd1243dSDimitry Andric "solver by size is superseded by manually selecting " 48bdd1243dSDimitry Andric "the solver (e.g. by amdgpu-igrouplp-exact-solver")); 4981ad6265SDimitry Andric 50bdd1243dSDimitry Andric static cl::opt<uint64_t> MaxBranchesExplored( 51bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden, 52bdd1243dSDimitry Andric cl::desc("The amount of branches that we are willing to explore with" 53bdd1243dSDimitry Andric "the exact algorithm before giving up.")); 5481ad6265SDimitry Andric 55bdd1243dSDimitry Andric static cl::opt<bool> UseCostHeur( 56bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden, 57bdd1243dSDimitry Andric cl::desc("Whether to use the cost heuristic to make choices as we " 58bdd1243dSDimitry Andric "traverse the search space using the exact solver. Defaulted " 59bdd1243dSDimitry Andric "to on, and if turned off, we will use the node order -- " 60bdd1243dSDimitry Andric "attempting to put the later nodes in the later sched groups. " 61bdd1243dSDimitry Andric "Experimentally, results are mixed, so this should be set on a " 62bdd1243dSDimitry Andric "case-by-case basis.")); 6381ad6265SDimitry Andric 64bdd1243dSDimitry Andric // Components of the mask that determines which instruction types may be may be 65bdd1243dSDimitry Andric // classified into a SchedGroup. 66bdd1243dSDimitry Andric enum class SchedGroupMask { 6781ad6265SDimitry Andric NONE = 0u, 6881ad6265SDimitry Andric ALU = 1u << 0, 6981ad6265SDimitry Andric VALU = 1u << 1, 7081ad6265SDimitry Andric SALU = 1u << 2, 7181ad6265SDimitry Andric MFMA = 1u << 3, 7281ad6265SDimitry Andric VMEM = 1u << 4, 7381ad6265SDimitry Andric VMEM_READ = 1u << 5, 7481ad6265SDimitry Andric VMEM_WRITE = 1u << 6, 7581ad6265SDimitry Andric DS = 1u << 7, 7681ad6265SDimitry Andric DS_READ = 1u << 8, 7781ad6265SDimitry Andric DS_WRITE = 1u << 9, 78*cb14a3feSDimitry Andric TRANS = 1u << 10, 79bdd1243dSDimitry Andric ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS | 80*cb14a3feSDimitry Andric DS_READ | DS_WRITE | TRANS, 81bdd1243dSDimitry Andric LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) 8281ad6265SDimitry Andric }; 8381ad6265SDimitry Andric 8406c3fb27SDimitry Andric class SchedGroup; 8506c3fb27SDimitry Andric 8606c3fb27SDimitry Andric // InstructionRule class is used to enact a filter which determines whether or 8706c3fb27SDimitry Andric // not an SU maps to a given SchedGroup. It contains complementary data 8806c3fb27SDimitry Andric // structures (e.g Cache) to help those filters. 8906c3fb27SDimitry Andric class InstructionRule { 9006c3fb27SDimitry Andric protected: 9106c3fb27SDimitry Andric const SIInstrInfo *TII; 9206c3fb27SDimitry Andric unsigned SGID; 9306c3fb27SDimitry Andric // A cache made available to the Filter to store SUnits for subsequent 9406c3fb27SDimitry Andric // invocations of the Filter 9506c3fb27SDimitry Andric std::optional<SmallVector<SUnit *, 4>> Cache; 9606c3fb27SDimitry Andric 9706c3fb27SDimitry Andric public: 9806c3fb27SDimitry Andric virtual bool 9906c3fb27SDimitry Andric apply(const SUnit *, const ArrayRef<SUnit *>, 10006c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &) { 10106c3fb27SDimitry Andric return true; 10206c3fb27SDimitry Andric }; 10306c3fb27SDimitry Andric 10406c3fb27SDimitry Andric InstructionRule(const SIInstrInfo *TII, unsigned SGID, 10506c3fb27SDimitry Andric bool NeedsCache = false) 10606c3fb27SDimitry Andric : TII(TII), SGID(SGID) { 10706c3fb27SDimitry Andric if (NeedsCache) { 10806c3fb27SDimitry Andric Cache = SmallVector<SUnit *, 4>(); 10906c3fb27SDimitry Andric } 11006c3fb27SDimitry Andric } 11106c3fb27SDimitry Andric 11206c3fb27SDimitry Andric virtual ~InstructionRule() = default; 11306c3fb27SDimitry Andric }; 11406c3fb27SDimitry Andric 115bdd1243dSDimitry Andric typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap; 11681ad6265SDimitry Andric 117bdd1243dSDimitry Andric // Classify instructions into groups to enable fine tuned control over the 118bdd1243dSDimitry Andric // scheduler. These groups may be more specific than current SchedModel 119bdd1243dSDimitry Andric // instruction classes. 120bdd1243dSDimitry Andric class SchedGroup { 121bdd1243dSDimitry Andric private: 122bdd1243dSDimitry Andric // Mask that defines which instruction types can be classified into this 123bdd1243dSDimitry Andric // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER 124bdd1243dSDimitry Andric // and SCHED_GROUP_BARRIER. 125bdd1243dSDimitry Andric SchedGroupMask SGMask; 12681ad6265SDimitry Andric 127bdd1243dSDimitry Andric // Maximum number of SUnits that can be added to this group. 128bdd1243dSDimitry Andric std::optional<unsigned> MaxSize; 12981ad6265SDimitry Andric 130bdd1243dSDimitry Andric // SchedGroups will only synchronize with other SchedGroups that have the same 131bdd1243dSDimitry Andric // SyncID. 132bdd1243dSDimitry Andric int SyncID = 0; 13381ad6265SDimitry Andric 134bdd1243dSDimitry Andric // SGID is used to map instructions to candidate SchedGroups 135bdd1243dSDimitry Andric unsigned SGID; 136bdd1243dSDimitry Andric 13706c3fb27SDimitry Andric // The different rules each instruction in this SchedGroup must conform to 13806c3fb27SDimitry Andric SmallVector<std::shared_ptr<InstructionRule>, 4> Rules; 13906c3fb27SDimitry Andric 140bdd1243dSDimitry Andric // Count of the number of created SchedGroups, used to initialize SGID. 141bdd1243dSDimitry Andric static unsigned NumSchedGroups; 142bdd1243dSDimitry Andric 143bdd1243dSDimitry Andric const SIInstrInfo *TII; 144bdd1243dSDimitry Andric 145bdd1243dSDimitry Andric // Try to add and edge from SU A to SU B. 146bdd1243dSDimitry Andric bool tryAddEdge(SUnit *A, SUnit *B); 147bdd1243dSDimitry Andric 148bdd1243dSDimitry Andric // Use SGMask to determine whether we can classify MI as a member of this 149bdd1243dSDimitry Andric // SchedGroup object. 150bdd1243dSDimitry Andric bool canAddMI(const MachineInstr &MI) const; 15181ad6265SDimitry Andric 15281ad6265SDimitry Andric public: 153bdd1243dSDimitry Andric // Collection of SUnits that are classified as members of this group. 154bdd1243dSDimitry Andric SmallVector<SUnit *, 32> Collection; 15581ad6265SDimitry Andric 15606c3fb27SDimitry Andric ScheduleDAGInstrs *DAG; 15706c3fb27SDimitry Andric 158bdd1243dSDimitry Andric // Returns true if SU can be added to this SchedGroup. 159bdd1243dSDimitry Andric bool canAddSU(SUnit &SU) const; 16081ad6265SDimitry Andric 161bdd1243dSDimitry Andric // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If 162bdd1243dSDimitry Andric // MakePred is true, SU will be a predecessor of the SUnits in this 163bdd1243dSDimitry Andric // SchedGroup, otherwise SU will be a successor. 164bdd1243dSDimitry Andric void link(SUnit &SU, bool MakePred = false); 16581ad6265SDimitry Andric 166bdd1243dSDimitry Andric // Add DAG dependencies and track which edges are added, and the count of 167bdd1243dSDimitry Andric // missed edges 168bdd1243dSDimitry Andric int link(SUnit &SU, bool MakePred, 169bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 17081ad6265SDimitry Andric 171bdd1243dSDimitry Andric // Add DAG dependencies from all SUnits in this SchedGroup and this SU. 172bdd1243dSDimitry Andric // Use the predicate to determine whether SU should be a predecessor (P = 173bdd1243dSDimitry Andric // true) or a successor (P = false) of this SchedGroup. 174bdd1243dSDimitry Andric void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P); 17581ad6265SDimitry Andric 176bdd1243dSDimitry Andric // Add DAG dependencies such that SUnits in this group shall be ordered 177bdd1243dSDimitry Andric // before SUnits in OtherGroup. 178bdd1243dSDimitry Andric void link(SchedGroup &OtherGroup); 179bdd1243dSDimitry Andric 180bdd1243dSDimitry Andric // Returns true if no more instructions may be added to this group. 181bdd1243dSDimitry Andric bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; } 182bdd1243dSDimitry Andric 18306c3fb27SDimitry Andric // Append a constraint that SUs must meet in order to fit into this 18406c3fb27SDimitry Andric // SchedGroup. Since many rules involve the relationship between a SchedGroup 18506c3fb27SDimitry Andric // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve 18606c3fb27SDimitry Andric // time (rather than SchedGroup init time.) 18706c3fb27SDimitry Andric void addRule(std::shared_ptr<InstructionRule> NewRule) { 18806c3fb27SDimitry Andric Rules.push_back(NewRule); 18906c3fb27SDimitry Andric } 19006c3fb27SDimitry Andric 19106c3fb27SDimitry Andric // Returns true if the SU matches all rules 19206c3fb27SDimitry Andric bool allowedByRules(const SUnit *SU, 19306c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) const { 19406c3fb27SDimitry Andric if (Rules.empty()) 19506c3fb27SDimitry Andric return true; 19606c3fb27SDimitry Andric for (size_t I = 0; I < Rules.size(); I++) { 19706c3fb27SDimitry Andric auto TheRule = Rules[I].get(); 19806c3fb27SDimitry Andric if (!TheRule->apply(SU, Collection, SyncPipe)) { 19906c3fb27SDimitry Andric return false; 20006c3fb27SDimitry Andric } 20106c3fb27SDimitry Andric } 20206c3fb27SDimitry Andric return true; 20306c3fb27SDimitry Andric } 20406c3fb27SDimitry Andric 205bdd1243dSDimitry Andric // Add SU to the SchedGroup. 206bdd1243dSDimitry Andric void add(SUnit &SU) { 207bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "For SchedGroup with mask " 208bdd1243dSDimitry Andric << format_hex((int)SGMask, 10, true) << " adding " 209bdd1243dSDimitry Andric << *SU.getInstr()); 210bdd1243dSDimitry Andric Collection.push_back(&SU); 21181ad6265SDimitry Andric } 21281ad6265SDimitry Andric 213bdd1243dSDimitry Andric // Remove last element in the SchedGroup 214bdd1243dSDimitry Andric void pop() { Collection.pop_back(); } 215bdd1243dSDimitry Andric 216bdd1243dSDimitry Andric // Identify and add all relevant SUs from the DAG to this SchedGroup. 217bdd1243dSDimitry Andric void initSchedGroup(); 218bdd1243dSDimitry Andric 219bdd1243dSDimitry Andric // Add instructions to the SchedGroup bottom up starting from RIter. 220bdd1243dSDimitry Andric // PipelineInstrs is a set of instructions that should not be added to the 221bdd1243dSDimitry Andric // SchedGroup even when the other conditions for adding it are satisfied. 222bdd1243dSDimitry Andric // RIter will be added to the SchedGroup as well, and dependencies will be 223bdd1243dSDimitry Andric // added so that RIter will always be scheduled at the end of the group. 224bdd1243dSDimitry Andric void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 225bdd1243dSDimitry Andric SUnitsToCandidateSGsMap &SyncedInstrs); 226bdd1243dSDimitry Andric 227bdd1243dSDimitry Andric void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs); 228bdd1243dSDimitry Andric 229bdd1243dSDimitry Andric int getSyncID() { return SyncID; } 230bdd1243dSDimitry Andric 231bdd1243dSDimitry Andric int getSGID() { return SGID; } 232bdd1243dSDimitry Andric 233bdd1243dSDimitry Andric SchedGroupMask getMask() { return SGMask; } 234bdd1243dSDimitry Andric 235bdd1243dSDimitry Andric SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, 236bdd1243dSDimitry Andric ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 23706c3fb27SDimitry Andric : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) { 238bdd1243dSDimitry Andric SGID = NumSchedGroups++; 239bdd1243dSDimitry Andric } 240bdd1243dSDimitry Andric 241bdd1243dSDimitry Andric SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID, 242bdd1243dSDimitry Andric ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 24306c3fb27SDimitry Andric : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) { 244bdd1243dSDimitry Andric SGID = NumSchedGroups++; 245bdd1243dSDimitry Andric } 246bdd1243dSDimitry Andric }; 247bdd1243dSDimitry Andric 248bdd1243dSDimitry Andric // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. 249bdd1243dSDimitry Andric static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { 250bdd1243dSDimitry Andric assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || 251bdd1243dSDimitry Andric SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || 252bdd1243dSDimitry Andric SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT); 253bdd1243dSDimitry Andric 254bdd1243dSDimitry Andric while (!SU.Preds.empty()) 255bdd1243dSDimitry Andric for (auto &P : SU.Preds) 256bdd1243dSDimitry Andric SU.removePred(P); 257bdd1243dSDimitry Andric 258bdd1243dSDimitry Andric while (!SU.Succs.empty()) 259bdd1243dSDimitry Andric for (auto &S : SU.Succs) 260bdd1243dSDimitry Andric for (auto &SP : S.getSUnit()->Preds) 261bdd1243dSDimitry Andric if (SP.getSUnit() == &SU) 262bdd1243dSDimitry Andric S.getSUnit()->removePred(SP); 263bdd1243dSDimitry Andric } 264bdd1243dSDimitry Andric 265bdd1243dSDimitry Andric typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair; 266bdd1243dSDimitry Andric typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec; 267bdd1243dSDimitry Andric 268bdd1243dSDimitry Andric // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline 269bdd1243dSDimitry Andric // in non-trivial cases. For example, if the requested pipeline is 270bdd1243dSDimitry Andric // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction 271bdd1243dSDimitry Andric // in the DAG, then we will have an instruction that can not be trivially 272bdd1243dSDimitry Andric // assigned to a SchedGroup. The PipelineSolver class implements two algorithms 273bdd1243dSDimitry Andric // to find a good solution to the pipeline -- a greedy algorithm and an exact 274bdd1243dSDimitry Andric // algorithm. The exact algorithm has an exponential time complexity and should 275bdd1243dSDimitry Andric // only be used for small sized problems or medium sized problems where an exact 276bdd1243dSDimitry Andric // solution is highly desired. 277bdd1243dSDimitry Andric class PipelineSolver { 278bdd1243dSDimitry Andric ScheduleDAGMI *DAG; 279bdd1243dSDimitry Andric 280bdd1243dSDimitry Andric // Instructions that can be assigned to multiple SchedGroups 281bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; 282bdd1243dSDimitry Andric SmallVector<SUsToCandSGsVec, 4> PipelineInstrs; 283bdd1243dSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; 284bdd1243dSDimitry Andric // The current working pipeline 285bdd1243dSDimitry Andric SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline; 286bdd1243dSDimitry Andric // The pipeline that has the best solution found so far 287bdd1243dSDimitry Andric SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline; 288bdd1243dSDimitry Andric 289bdd1243dSDimitry Andric // Whether or not we actually have any SyncedInstrs to try to solve. 290bdd1243dSDimitry Andric bool NeedsSolver = false; 291bdd1243dSDimitry Andric 292bdd1243dSDimitry Andric // Compute an estimate of the size of search tree -- the true size is 293bdd1243dSDimitry Andric // the product of each conflictedInst.Matches.size() across all SyncPipelines 294bdd1243dSDimitry Andric unsigned computeProblemSize(); 295bdd1243dSDimitry Andric 296bdd1243dSDimitry Andric // The cost penalty of not assigning a SU to a SchedGroup 297bdd1243dSDimitry Andric int MissPenalty = 0; 298bdd1243dSDimitry Andric 299bdd1243dSDimitry Andric // Costs in terms of the number of edges we are unable to add 300bdd1243dSDimitry Andric int BestCost = -1; 301bdd1243dSDimitry Andric int CurrCost = 0; 302bdd1243dSDimitry Andric 303bdd1243dSDimitry Andric // Index pointing to the conflicting instruction that is currently being 304bdd1243dSDimitry Andric // fitted 305bdd1243dSDimitry Andric int CurrConflInstNo = 0; 306bdd1243dSDimitry Andric // Index to the pipeline that is currently being fitted 307bdd1243dSDimitry Andric int CurrSyncGroupIdx = 0; 308bdd1243dSDimitry Andric // The first non trivial pipeline 309bdd1243dSDimitry Andric int BeginSyncGroupIdx = 0; 310bdd1243dSDimitry Andric 311bdd1243dSDimitry Andric // How many branches we have explored 312bdd1243dSDimitry Andric uint64_t BranchesExplored = 0; 313bdd1243dSDimitry Andric 31406c3fb27SDimitry Andric // The direction in which we process the candidate SchedGroups per SU 31506c3fb27SDimitry Andric bool IsBottomUp = 1; 31606c3fb27SDimitry Andric 317bdd1243dSDimitry Andric // Update indices to fit next conflicting instruction 318bdd1243dSDimitry Andric void advancePosition(); 319bdd1243dSDimitry Andric // Recede indices to attempt to find better fit for previous conflicting 320bdd1243dSDimitry Andric // instruction 321bdd1243dSDimitry Andric void retreatPosition(); 322bdd1243dSDimitry Andric 323bdd1243dSDimitry Andric // The exponential time algorithm which finds the provably best fit 324bdd1243dSDimitry Andric bool solveExact(); 325bdd1243dSDimitry Andric // The polynomial time algorithm which attempts to find a good fit 326bdd1243dSDimitry Andric bool solveGreedy(); 32706c3fb27SDimitry Andric // Find the best SchedGroup for the current SU using the heuristic given all 32806c3fb27SDimitry Andric // current information. One step in the greedy algorithm. Templated against 32906c3fb27SDimitry Andric // the SchedGroup iterator (either reverse or forward). 33006c3fb27SDimitry Andric template <typename T> 33106c3fb27SDimitry Andric void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, 33206c3fb27SDimitry Andric T E); 333bdd1243dSDimitry Andric // Whether or not the current solution is optimal 334bdd1243dSDimitry Andric bool checkOptimal(); 335bdd1243dSDimitry Andric // Populate the ready list, prioiritizing fewest missed edges first 33606c3fb27SDimitry Andric // Templated against the SchedGroup iterator (either reverse or forward). 33706c3fb27SDimitry Andric template <typename T> 33806c3fb27SDimitry Andric void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, 33906c3fb27SDimitry Andric T E); 340bdd1243dSDimitry Andric // Add edges corresponding to the SchedGroups as assigned by solver 341bdd1243dSDimitry Andric void makePipeline(); 34206c3fb27SDimitry Andric // Link the SchedGroups in the best found pipeline. 34306c3fb27SDimitry Andric // Tmplated against the SchedGroup iterator (either reverse or forward). 34406c3fb27SDimitry Andric template <typename T> void linkSchedGroups(T I, T E); 345bdd1243dSDimitry Andric // Add the edges from the SU to the other SchedGroups in pipeline, and 346bdd1243dSDimitry Andric // return the number of edges missed. 347bdd1243dSDimitry Andric int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, 348bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 3495f757f3fSDimitry Andric /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It 3505f757f3fSDimitry Andric /// returns the cost (in terms of missed pipeline edges), and tracks the edges 3515f757f3fSDimitry Andric /// added in \p AddedEdges 35206c3fb27SDimitry Andric template <typename T> 35306c3fb27SDimitry Andric int linkSUnit(SUnit *SU, int SGID, 35406c3fb27SDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); 3555f757f3fSDimitry Andric /// Remove the edges passed via \p AddedEdges 356bdd1243dSDimitry Andric void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 357bdd1243dSDimitry Andric // Convert the passed in maps to arrays for bidirectional iterators 358bdd1243dSDimitry Andric void convertSyncMapsToArrays(); 359bdd1243dSDimitry Andric 360bdd1243dSDimitry Andric void reset(); 361bdd1243dSDimitry Andric 362bdd1243dSDimitry Andric public: 363bdd1243dSDimitry Andric // Invoke the solver to map instructions to instruction groups. Heuristic && 364bdd1243dSDimitry Andric // command-line-option determines to use exact or greedy algorithm. 365bdd1243dSDimitry Andric void solve(); 366bdd1243dSDimitry Andric 367bdd1243dSDimitry Andric PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 368bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 36906c3fb27SDimitry Andric ScheduleDAGMI *DAG, bool IsBottomUp = 1) 370bdd1243dSDimitry Andric : DAG(DAG), SyncedInstrs(SyncedInstrs), 37106c3fb27SDimitry Andric SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { 372bdd1243dSDimitry Andric 373bdd1243dSDimitry Andric for (auto &PipelineInstrs : SyncedInstrs) { 374bdd1243dSDimitry Andric if (PipelineInstrs.second.size() > 0) { 375bdd1243dSDimitry Andric NeedsSolver = true; 376bdd1243dSDimitry Andric break; 377bdd1243dSDimitry Andric } 378bdd1243dSDimitry Andric } 379bdd1243dSDimitry Andric 380bdd1243dSDimitry Andric if (!NeedsSolver) 381bdd1243dSDimitry Andric return; 382bdd1243dSDimitry Andric 383bdd1243dSDimitry Andric convertSyncMapsToArrays(); 384bdd1243dSDimitry Andric 385bdd1243dSDimitry Andric CurrPipeline = BestPipeline; 386bdd1243dSDimitry Andric 387bdd1243dSDimitry Andric while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() && 388bdd1243dSDimitry Andric PipelineInstrs[BeginSyncGroupIdx].size() == 0) 389bdd1243dSDimitry Andric ++BeginSyncGroupIdx; 390bdd1243dSDimitry Andric 391bdd1243dSDimitry Andric if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size()) 392bdd1243dSDimitry Andric return; 393bdd1243dSDimitry Andric } 394bdd1243dSDimitry Andric }; 395bdd1243dSDimitry Andric 396bdd1243dSDimitry Andric void PipelineSolver::reset() { 397bdd1243dSDimitry Andric 398bdd1243dSDimitry Andric for (auto &SyncPipeline : CurrPipeline) { 399bdd1243dSDimitry Andric for (auto &SG : SyncPipeline) { 400bdd1243dSDimitry Andric SmallVector<SUnit *, 32> TempCollection = SG.Collection; 401bdd1243dSDimitry Andric SG.Collection.clear(); 402bdd1243dSDimitry Andric auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { 403bdd1243dSDimitry Andric return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; 404bdd1243dSDimitry Andric }); 405bdd1243dSDimitry Andric if (SchedBarr != TempCollection.end()) 406bdd1243dSDimitry Andric SG.Collection.push_back(*SchedBarr); 407bdd1243dSDimitry Andric } 408bdd1243dSDimitry Andric } 409bdd1243dSDimitry Andric 410bdd1243dSDimitry Andric CurrSyncGroupIdx = BeginSyncGroupIdx; 411bdd1243dSDimitry Andric CurrConflInstNo = 0; 412bdd1243dSDimitry Andric CurrCost = 0; 413bdd1243dSDimitry Andric } 414bdd1243dSDimitry Andric 415bdd1243dSDimitry Andric void PipelineSolver::convertSyncMapsToArrays() { 416bdd1243dSDimitry Andric for (auto &SyncPipe : SyncedSchedGroups) { 417bdd1243dSDimitry Andric BestPipeline.insert(BestPipeline.begin(), SyncPipe.second); 418bdd1243dSDimitry Andric } 419bdd1243dSDimitry Andric 420bdd1243dSDimitry Andric int PipelineIDx = SyncedInstrs.size() - 1; 421bdd1243dSDimitry Andric PipelineInstrs.resize(SyncedInstrs.size()); 422bdd1243dSDimitry Andric for (auto &SyncInstrMap : SyncedInstrs) { 423bdd1243dSDimitry Andric for (auto &SUsToCandSGs : SyncInstrMap.second) { 424bdd1243dSDimitry Andric if (PipelineInstrs[PipelineIDx].size() == 0) { 425bdd1243dSDimitry Andric PipelineInstrs[PipelineIDx].push_back( 426bdd1243dSDimitry Andric std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 427bdd1243dSDimitry Andric continue; 428bdd1243dSDimitry Andric } 429bdd1243dSDimitry Andric auto SortPosition = PipelineInstrs[PipelineIDx].begin(); 430bdd1243dSDimitry Andric // Insert them in sorted order -- this allows for good parsing order in 431bdd1243dSDimitry Andric // the greedy algorithm 432bdd1243dSDimitry Andric while (SortPosition != PipelineInstrs[PipelineIDx].end() && 433bdd1243dSDimitry Andric SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum) 434bdd1243dSDimitry Andric ++SortPosition; 435bdd1243dSDimitry Andric PipelineInstrs[PipelineIDx].insert( 436bdd1243dSDimitry Andric SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 437bdd1243dSDimitry Andric } 438bdd1243dSDimitry Andric --PipelineIDx; 439bdd1243dSDimitry Andric } 440bdd1243dSDimitry Andric } 441bdd1243dSDimitry Andric 44206c3fb27SDimitry Andric template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { 44306c3fb27SDimitry Andric for (; I != E; ++I) { 44406c3fb27SDimitry Andric auto &GroupA = *I; 44506c3fb27SDimitry Andric for (auto J = std::next(I); J != E; ++J) { 44606c3fb27SDimitry Andric auto &GroupB = *J; 44706c3fb27SDimitry Andric GroupA.link(GroupB); 44806c3fb27SDimitry Andric } 44906c3fb27SDimitry Andric } 45006c3fb27SDimitry Andric } 45106c3fb27SDimitry Andric 452bdd1243dSDimitry Andric void PipelineSolver::makePipeline() { 453bdd1243dSDimitry Andric // Preserve the order of barrier for subsequent SchedGroupBarrier mutations 454bdd1243dSDimitry Andric for (auto &SyncPipeline : BestPipeline) { 45506c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "Printing SchedGroups\n"); 456bdd1243dSDimitry Andric for (auto &SG : SyncPipeline) { 45706c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() 45806c3fb27SDimitry Andric << " has: \n"); 459bdd1243dSDimitry Andric SUnit *SGBarr = nullptr; 460bdd1243dSDimitry Andric for (auto &SU : SG.Collection) { 461bdd1243dSDimitry Andric if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 462bdd1243dSDimitry Andric SGBarr = SU; 46306c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); 464bdd1243dSDimitry Andric } 465bdd1243dSDimitry Andric // Command line requested IGroupLP doesn't have SGBarr 466bdd1243dSDimitry Andric if (!SGBarr) 467bdd1243dSDimitry Andric continue; 468bdd1243dSDimitry Andric resetEdges(*SGBarr, DAG); 469bdd1243dSDimitry Andric SG.link(*SGBarr, false); 470bdd1243dSDimitry Andric } 471bdd1243dSDimitry Andric } 472bdd1243dSDimitry Andric 473bdd1243dSDimitry Andric for (auto &SyncPipeline : BestPipeline) { 47406c3fb27SDimitry Andric IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) 47506c3fb27SDimitry Andric : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); 47681ad6265SDimitry Andric } 47781ad6265SDimitry Andric } 47806c3fb27SDimitry Andric 47906c3fb27SDimitry Andric template <typename T> 48006c3fb27SDimitry Andric int PipelineSolver::linkSUnit( 48106c3fb27SDimitry Andric SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, 48206c3fb27SDimitry Andric T I, T E) { 48306c3fb27SDimitry Andric bool MakePred = false; 48406c3fb27SDimitry Andric int AddedCost = 0; 48506c3fb27SDimitry Andric for (; I < E; ++I) { 48606c3fb27SDimitry Andric if (I->getSGID() == SGID) { 48706c3fb27SDimitry Andric MakePred = true; 48806c3fb27SDimitry Andric continue; 48981ad6265SDimitry Andric } 49006c3fb27SDimitry Andric auto Group = *I; 49106c3fb27SDimitry Andric AddedCost += Group.link(*SU, MakePred, AddedEdges); 49206c3fb27SDimitry Andric assert(AddedCost >= 0); 49306c3fb27SDimitry Andric } 49406c3fb27SDimitry Andric return AddedCost; 495bdd1243dSDimitry Andric } 49681ad6265SDimitry Andric 497bdd1243dSDimitry Andric int PipelineSolver::addEdges( 498bdd1243dSDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, 499bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 500bdd1243dSDimitry Andric 50106c3fb27SDimitry Andric // For IsBottomUp, the first SchedGroup in SyncPipeline contains the 50206c3fb27SDimitry Andric // instructions that are the ultimate successors in the resultant mutation. 50306c3fb27SDimitry Andric // Therefore, in such a configuration, the SchedGroups occurring before the 50406c3fb27SDimitry Andric // candidate SGID are successors of the candidate SchedGroup, thus the current 50506c3fb27SDimitry Andric // SU should be linked as a predecessor to SUs in those SchedGroups. The 50606c3fb27SDimitry Andric // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple 50706c3fb27SDimitry Andric // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using 50806c3fb27SDimitry Andric // IsBottomUp (in reverse). 50906c3fb27SDimitry Andric return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), 51006c3fb27SDimitry Andric SyncPipeline.rend()) 51106c3fb27SDimitry Andric : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), 51206c3fb27SDimitry Andric SyncPipeline.end()); 513bdd1243dSDimitry Andric } 514bdd1243dSDimitry Andric 515bdd1243dSDimitry Andric void PipelineSolver::removeEdges( 516bdd1243dSDimitry Andric const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { 517bdd1243dSDimitry Andric // Only remove the edges that we have added when testing 518bdd1243dSDimitry Andric // the fit. 519bdd1243dSDimitry Andric for (auto &PredSuccPair : EdgesToRemove) { 520bdd1243dSDimitry Andric SUnit *Pred = PredSuccPair.first; 521bdd1243dSDimitry Andric SUnit *Succ = PredSuccPair.second; 522bdd1243dSDimitry Andric 523bdd1243dSDimitry Andric auto Match = llvm::find_if( 524bdd1243dSDimitry Andric Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); 525bdd1243dSDimitry Andric if (Match != Succ->Preds.end()) { 526bdd1243dSDimitry Andric assert(Match->isArtificial()); 527bdd1243dSDimitry Andric Succ->removePred(*Match); 528bdd1243dSDimitry Andric } 529bdd1243dSDimitry Andric } 530bdd1243dSDimitry Andric } 531bdd1243dSDimitry Andric 532bdd1243dSDimitry Andric void PipelineSolver::advancePosition() { 533bdd1243dSDimitry Andric ++CurrConflInstNo; 534bdd1243dSDimitry Andric 535bdd1243dSDimitry Andric if (static_cast<size_t>(CurrConflInstNo) >= 536bdd1243dSDimitry Andric PipelineInstrs[CurrSyncGroupIdx].size()) { 537bdd1243dSDimitry Andric CurrConflInstNo = 0; 538bdd1243dSDimitry Andric ++CurrSyncGroupIdx; 539bdd1243dSDimitry Andric // Advance to next non-trivial pipeline 540bdd1243dSDimitry Andric while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && 541bdd1243dSDimitry Andric PipelineInstrs[CurrSyncGroupIdx].size() == 0) 542bdd1243dSDimitry Andric ++CurrSyncGroupIdx; 543bdd1243dSDimitry Andric } 544bdd1243dSDimitry Andric } 545bdd1243dSDimitry Andric 546bdd1243dSDimitry Andric void PipelineSolver::retreatPosition() { 547bdd1243dSDimitry Andric assert(CurrConflInstNo >= 0); 548bdd1243dSDimitry Andric assert(CurrSyncGroupIdx >= 0); 549bdd1243dSDimitry Andric 550bdd1243dSDimitry Andric if (CurrConflInstNo > 0) { 551bdd1243dSDimitry Andric --CurrConflInstNo; 552bdd1243dSDimitry Andric return; 553bdd1243dSDimitry Andric } 554bdd1243dSDimitry Andric 555bdd1243dSDimitry Andric if (CurrConflInstNo == 0) { 556bdd1243dSDimitry Andric // If we return to the starting position, we have explored 557bdd1243dSDimitry Andric // the entire tree 558bdd1243dSDimitry Andric if (CurrSyncGroupIdx == BeginSyncGroupIdx) 559bdd1243dSDimitry Andric return; 560bdd1243dSDimitry Andric 561bdd1243dSDimitry Andric --CurrSyncGroupIdx; 562bdd1243dSDimitry Andric // Go to previous non-trivial pipeline 563bdd1243dSDimitry Andric while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) 564bdd1243dSDimitry Andric --CurrSyncGroupIdx; 565bdd1243dSDimitry Andric 566bdd1243dSDimitry Andric CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; 567bdd1243dSDimitry Andric } 568bdd1243dSDimitry Andric } 569bdd1243dSDimitry Andric 570bdd1243dSDimitry Andric bool PipelineSolver::checkOptimal() { 571bdd1243dSDimitry Andric if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { 572bdd1243dSDimitry Andric if (BestCost == -1 || CurrCost < BestCost) { 573bdd1243dSDimitry Andric BestPipeline = CurrPipeline; 574bdd1243dSDimitry Andric BestCost = CurrCost; 575bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n"); 576bdd1243dSDimitry Andric } 577bdd1243dSDimitry Andric assert(BestCost >= 0); 578bdd1243dSDimitry Andric } 579bdd1243dSDimitry Andric 580bdd1243dSDimitry Andric bool DoneExploring = false; 581bdd1243dSDimitry Andric if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) 582bdd1243dSDimitry Andric DoneExploring = true; 583bdd1243dSDimitry Andric 584bdd1243dSDimitry Andric return (DoneExploring || BestCost == 0); 585bdd1243dSDimitry Andric } 586bdd1243dSDimitry Andric 58706c3fb27SDimitry Andric template <typename T> 588bdd1243dSDimitry Andric void PipelineSolver::populateReadyList( 58906c3fb27SDimitry Andric SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { 59006c3fb27SDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 59106c3fb27SDimitry Andric auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 592bdd1243dSDimitry Andric assert(CurrSU.second.size() >= 1); 59306c3fb27SDimitry Andric 594bdd1243dSDimitry Andric for (; I != E; ++I) { 595bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 596bdd1243dSDimitry Andric int CandSGID = *I; 5975f757f3fSDimitry Andric SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 5985f757f3fSDimitry Andric return SG.getSGID() == CandSGID; 5995f757f3fSDimitry Andric }); 6005f757f3fSDimitry Andric assert(Match); 601bdd1243dSDimitry Andric 602bdd1243dSDimitry Andric if (UseCostHeur) { 603bdd1243dSDimitry Andric if (Match->isFull()) { 604bdd1243dSDimitry Andric ReadyList.push_back(std::pair(*I, MissPenalty)); 605bdd1243dSDimitry Andric continue; 606bdd1243dSDimitry Andric } 607bdd1243dSDimitry Andric 608bdd1243dSDimitry Andric int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 609bdd1243dSDimitry Andric ReadyList.push_back(std::pair(*I, TempCost)); 610bdd1243dSDimitry Andric removeEdges(AddedEdges); 611bdd1243dSDimitry Andric } else 612bdd1243dSDimitry Andric ReadyList.push_back(std::pair(*I, -1)); 613bdd1243dSDimitry Andric } 614bdd1243dSDimitry Andric 615bdd1243dSDimitry Andric if (UseCostHeur) { 616bdd1243dSDimitry Andric std::sort(ReadyList.begin(), ReadyList.end(), 617bdd1243dSDimitry Andric [](std::pair<int, int> A, std::pair<int, int> B) { 618bdd1243dSDimitry Andric return A.second < B.second; 619bdd1243dSDimitry Andric }); 620bdd1243dSDimitry Andric } 621bdd1243dSDimitry Andric 622bdd1243dSDimitry Andric assert(ReadyList.size() == CurrSU.second.size()); 623bdd1243dSDimitry Andric } 624bdd1243dSDimitry Andric 625bdd1243dSDimitry Andric bool PipelineSolver::solveExact() { 626bdd1243dSDimitry Andric if (checkOptimal()) 627bdd1243dSDimitry Andric return true; 628bdd1243dSDimitry Andric 629bdd1243dSDimitry Andric if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) 630bdd1243dSDimitry Andric return false; 631bdd1243dSDimitry Andric 632bdd1243dSDimitry Andric assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); 633bdd1243dSDimitry Andric assert(static_cast<size_t>(CurrConflInstNo) < 634bdd1243dSDimitry Andric PipelineInstrs[CurrSyncGroupIdx].size()); 635bdd1243dSDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 636bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 637bdd1243dSDimitry Andric << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 638bdd1243dSDimitry Andric 639bdd1243dSDimitry Andric // SchedGroup -> Cost pairs 640bdd1243dSDimitry Andric SmallVector<std::pair<int, int>, 4> ReadyList; 641bdd1243dSDimitry Andric // Prioritize the candidate sched groups in terms of lowest cost first 64206c3fb27SDimitry Andric IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(), 64306c3fb27SDimitry Andric CurrSU.second.rend()) 64406c3fb27SDimitry Andric : populateReadyList(ReadyList, CurrSU.second.begin(), 64506c3fb27SDimitry Andric CurrSU.second.end()); 646bdd1243dSDimitry Andric 647bdd1243dSDimitry Andric auto I = ReadyList.begin(); 648bdd1243dSDimitry Andric auto E = ReadyList.end(); 649bdd1243dSDimitry Andric for (; I != E; ++I) { 650bdd1243dSDimitry Andric // If we are trying SGs in least cost order, and the current SG is cost 651bdd1243dSDimitry Andric // infeasible, then all subsequent SGs will also be cost infeasible, so we 652bdd1243dSDimitry Andric // can prune. 653bdd1243dSDimitry Andric if (BestCost != -1 && (CurrCost + I->second > BestCost)) 654bdd1243dSDimitry Andric return false; 655bdd1243dSDimitry Andric 656bdd1243dSDimitry Andric int CandSGID = I->first; 657bdd1243dSDimitry Andric int AddedCost = 0; 658bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 659bdd1243dSDimitry Andric auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 660bdd1243dSDimitry Andric SchedGroup *Match; 661bdd1243dSDimitry Andric for (auto &SG : SyncPipeline) { 662bdd1243dSDimitry Andric if (SG.getSGID() == CandSGID) 663bdd1243dSDimitry Andric Match = &SG; 664bdd1243dSDimitry Andric } 665bdd1243dSDimitry Andric 666bdd1243dSDimitry Andric if (Match->isFull()) 667bdd1243dSDimitry Andric continue; 668bdd1243dSDimitry Andric 66906c3fb27SDimitry Andric if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) 67006c3fb27SDimitry Andric continue; 67106c3fb27SDimitry Andric 672bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " 673bdd1243dSDimitry Andric << (int)Match->getMask() << "and ID " << CandSGID 674bdd1243dSDimitry Andric << "\n"); 675bdd1243dSDimitry Andric Match->add(*CurrSU.first); 676bdd1243dSDimitry Andric AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 677bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n"); 678bdd1243dSDimitry Andric CurrCost += AddedCost; 679bdd1243dSDimitry Andric advancePosition(); 680bdd1243dSDimitry Andric ++BranchesExplored; 681bdd1243dSDimitry Andric bool FinishedExploring = false; 682bdd1243dSDimitry Andric // If the Cost after adding edges is greater than a known solution, 683bdd1243dSDimitry Andric // backtrack 684bdd1243dSDimitry Andric if (CurrCost < BestCost || BestCost == -1) { 685bdd1243dSDimitry Andric if (solveExact()) { 686bdd1243dSDimitry Andric FinishedExploring = BestCost != 0; 687bdd1243dSDimitry Andric if (!FinishedExploring) 688bdd1243dSDimitry Andric return true; 689bdd1243dSDimitry Andric } 690bdd1243dSDimitry Andric } 691bdd1243dSDimitry Andric 692bdd1243dSDimitry Andric retreatPosition(); 693bdd1243dSDimitry Andric CurrCost -= AddedCost; 694bdd1243dSDimitry Andric removeEdges(AddedEdges); 695bdd1243dSDimitry Andric Match->pop(); 696bdd1243dSDimitry Andric CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 697bdd1243dSDimitry Andric if (FinishedExploring) 698bdd1243dSDimitry Andric return true; 699bdd1243dSDimitry Andric } 700bdd1243dSDimitry Andric 701bdd1243dSDimitry Andric // Try the pipeline where the current instruction is omitted 702bdd1243dSDimitry Andric // Potentially if we omit a problematic instruction from the pipeline, 703bdd1243dSDimitry Andric // all the other instructions can nicely fit. 704bdd1243dSDimitry Andric CurrCost += MissPenalty; 705bdd1243dSDimitry Andric advancePosition(); 706bdd1243dSDimitry Andric 707bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n"); 708bdd1243dSDimitry Andric 709bdd1243dSDimitry Andric bool FinishedExploring = false; 710bdd1243dSDimitry Andric if (CurrCost < BestCost || BestCost == -1) { 711bdd1243dSDimitry Andric if (solveExact()) { 712bdd1243dSDimitry Andric bool FinishedExploring = BestCost != 0; 713bdd1243dSDimitry Andric if (!FinishedExploring) 714bdd1243dSDimitry Andric return true; 715bdd1243dSDimitry Andric } 716bdd1243dSDimitry Andric } 717bdd1243dSDimitry Andric 718bdd1243dSDimitry Andric retreatPosition(); 719bdd1243dSDimitry Andric CurrCost -= MissPenalty; 720bdd1243dSDimitry Andric return FinishedExploring; 721bdd1243dSDimitry Andric } 722bdd1243dSDimitry Andric 72306c3fb27SDimitry Andric template <typename T> 72406c3fb27SDimitry Andric void PipelineSolver::greedyFind( 72506c3fb27SDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { 726bdd1243dSDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 727bdd1243dSDimitry Andric int BestNodeCost = -1; 728bdd1243dSDimitry Andric int TempCost; 729bdd1243dSDimitry Andric SchedGroup *BestGroup = nullptr; 730bdd1243dSDimitry Andric int BestGroupID = -1; 731bdd1243dSDimitry Andric auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 732bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 733bdd1243dSDimitry Andric << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 734bdd1243dSDimitry Andric 735bdd1243dSDimitry Andric // Since we have added the potential SchedGroups from bottom up, but 736bdd1243dSDimitry Andric // traversed the DAG from top down, parse over the groups from last to 737bdd1243dSDimitry Andric // first. If we fail to do this for the greedy algorithm, the solution will 738bdd1243dSDimitry Andric // likely not be good in more complex cases. 739bdd1243dSDimitry Andric for (; I != E; ++I) { 740bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 741bdd1243dSDimitry Andric int CandSGID = *I; 7425f757f3fSDimitry Andric SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 7435f757f3fSDimitry Andric return SG.getSGID() == CandSGID; 7445f757f3fSDimitry Andric }); 7455f757f3fSDimitry Andric assert(Match); 746bdd1243dSDimitry Andric 747bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " 748bdd1243dSDimitry Andric << (int)Match->getMask() << "\n"); 749bdd1243dSDimitry Andric 750bdd1243dSDimitry Andric if (Match->isFull()) { 751bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); 752bdd1243dSDimitry Andric continue; 753bdd1243dSDimitry Andric } 75406c3fb27SDimitry Andric if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { 75506c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n"); 75606c3fb27SDimitry Andric continue; 75706c3fb27SDimitry Andric } 758bdd1243dSDimitry Andric TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 759bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); 760bdd1243dSDimitry Andric if (TempCost < BestNodeCost || BestNodeCost == -1) { 761bdd1243dSDimitry Andric BestGroup = Match; 762bdd1243dSDimitry Andric BestNodeCost = TempCost; 763bdd1243dSDimitry Andric BestGroupID = CandSGID; 764bdd1243dSDimitry Andric } 765bdd1243dSDimitry Andric removeEdges(AddedEdges); 766bdd1243dSDimitry Andric if (BestNodeCost == 0) 767bdd1243dSDimitry Andric break; 768bdd1243dSDimitry Andric } 769bdd1243dSDimitry Andric 770bdd1243dSDimitry Andric if (BestGroupID != -1) { 771bdd1243dSDimitry Andric BestGroup->add(*CurrSU.first); 772bdd1243dSDimitry Andric addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); 773bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" 774bdd1243dSDimitry Andric << (int)BestGroup->getMask() << "\n"); 775bdd1243dSDimitry Andric BestCost += TempCost; 776bdd1243dSDimitry Andric } else 777bdd1243dSDimitry Andric BestCost += MissPenalty; 778bdd1243dSDimitry Andric 779bdd1243dSDimitry Andric CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 78006c3fb27SDimitry Andric } 78106c3fb27SDimitry Andric 78206c3fb27SDimitry Andric bool PipelineSolver::solveGreedy() { 78306c3fb27SDimitry Andric BestCost = 0; 78406c3fb27SDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 78506c3fb27SDimitry Andric 78606c3fb27SDimitry Andric while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { 78706c3fb27SDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 78806c3fb27SDimitry Andric IsBottomUp 78906c3fb27SDimitry Andric ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) 79006c3fb27SDimitry Andric : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); 791bdd1243dSDimitry Andric advancePosition(); 792bdd1243dSDimitry Andric } 793bdd1243dSDimitry Andric BestPipeline = CurrPipeline; 794bdd1243dSDimitry Andric removeEdges(AddedEdges); 795bdd1243dSDimitry Andric return false; 796bdd1243dSDimitry Andric } 797bdd1243dSDimitry Andric 798bdd1243dSDimitry Andric unsigned PipelineSolver::computeProblemSize() { 799bdd1243dSDimitry Andric unsigned ProblemSize = 0; 800bdd1243dSDimitry Andric for (auto &PipeConflicts : PipelineInstrs) { 801bdd1243dSDimitry Andric ProblemSize += PipeConflicts.size(); 802bdd1243dSDimitry Andric } 803bdd1243dSDimitry Andric 804bdd1243dSDimitry Andric return ProblemSize; 805bdd1243dSDimitry Andric } 806bdd1243dSDimitry Andric 807bdd1243dSDimitry Andric void PipelineSolver::solve() { 808bdd1243dSDimitry Andric if (!NeedsSolver) 809bdd1243dSDimitry Andric return; 810bdd1243dSDimitry Andric 811bdd1243dSDimitry Andric unsigned ProblemSize = computeProblemSize(); 812bdd1243dSDimitry Andric assert(ProblemSize > 0); 813bdd1243dSDimitry Andric 814bdd1243dSDimitry Andric bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; 815bdd1243dSDimitry Andric MissPenalty = (ProblemSize / 2) + 1; 816bdd1243dSDimitry Andric 817bdd1243dSDimitry Andric LLVM_DEBUG(DAG->dump()); 818bdd1243dSDimitry Andric if (EnableExactSolver || BelowCutoff) { 819bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n"); 820bdd1243dSDimitry Andric solveGreedy(); 821bdd1243dSDimitry Andric reset(); 822bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n"); 823bdd1243dSDimitry Andric if (BestCost > 0) { 824bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n"); 825bdd1243dSDimitry Andric solveExact(); 826bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n"); 827bdd1243dSDimitry Andric } 828bdd1243dSDimitry Andric } else { // Use the Greedy Algorithm by default 829bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n"); 830bdd1243dSDimitry Andric solveGreedy(); 831bdd1243dSDimitry Andric } 832bdd1243dSDimitry Andric 833bdd1243dSDimitry Andric makePipeline(); 83406c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "After applying mutation\n"); 83506c3fb27SDimitry Andric LLVM_DEBUG(DAG->dump()); 836bdd1243dSDimitry Andric } 837bdd1243dSDimitry Andric 83806c3fb27SDimitry Andric enum IGLPStrategyID : int { 83906c3fb27SDimitry Andric MFMASmallGemmOptID = 0, 84006c3fb27SDimitry Andric MFMASmallGemmSingleWaveOptID = 1, 84106c3fb27SDimitry Andric }; 842bdd1243dSDimitry Andric 843bdd1243dSDimitry Andric // Implement a IGLP scheduling strategy. 844bdd1243dSDimitry Andric class IGLPStrategy { 845bdd1243dSDimitry Andric protected: 846bdd1243dSDimitry Andric ScheduleDAGInstrs *DAG; 847bdd1243dSDimitry Andric 848bdd1243dSDimitry Andric const SIInstrInfo *TII; 849bdd1243dSDimitry Andric 850bdd1243dSDimitry Andric public: 8515f757f3fSDimitry Andric /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. 852bdd1243dSDimitry Andric virtual void applyIGLPStrategy( 853bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 8545f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 8555f757f3fSDimitry Andric bool IsReentry) = 0; 856bdd1243dSDimitry Andric 857bdd1243dSDimitry Andric // Returns true if this strategy should be applied to a ScheduleDAG. 858bdd1243dSDimitry Andric virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; 859bdd1243dSDimitry Andric 86006c3fb27SDimitry Andric bool IsBottomUp = 1; 86106c3fb27SDimitry Andric 862bdd1243dSDimitry Andric IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 863bdd1243dSDimitry Andric : DAG(DAG), TII(TII) {} 864bdd1243dSDimitry Andric 865bdd1243dSDimitry Andric virtual ~IGLPStrategy() = default; 866bdd1243dSDimitry Andric }; 867bdd1243dSDimitry Andric 868bdd1243dSDimitry Andric class MFMASmallGemmOpt final : public IGLPStrategy { 86906c3fb27SDimitry Andric private: 870bdd1243dSDimitry Andric public: 871bdd1243dSDimitry Andric void applyIGLPStrategy( 872bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 8735f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 8745f757f3fSDimitry Andric bool IsReentry) override; 875bdd1243dSDimitry Andric 876bdd1243dSDimitry Andric bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } 877bdd1243dSDimitry Andric 878bdd1243dSDimitry Andric MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 87906c3fb27SDimitry Andric : IGLPStrategy(DAG, TII) { 88006c3fb27SDimitry Andric IsBottomUp = 1; 88106c3fb27SDimitry Andric } 882bdd1243dSDimitry Andric }; 883bdd1243dSDimitry Andric 884bdd1243dSDimitry Andric void MFMASmallGemmOpt::applyIGLPStrategy( 885bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 8865f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 8875f757f3fSDimitry Andric bool IsReentry) { 888bdd1243dSDimitry Andric // Count the number of MFMA instructions. 889bdd1243dSDimitry Andric unsigned MFMACount = 0; 890bdd1243dSDimitry Andric for (const MachineInstr &I : *DAG) 891bdd1243dSDimitry Andric if (TII->isMFMAorWMMA(I)) 892bdd1243dSDimitry Andric ++MFMACount; 893bdd1243dSDimitry Andric 894bdd1243dSDimitry Andric const unsigned PipelineSyncID = 0; 895bdd1243dSDimitry Andric SchedGroup *SG = nullptr; 896bdd1243dSDimitry Andric for (unsigned I = 0; I < MFMACount * 3; ++I) { 897bdd1243dSDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 898bdd1243dSDimitry Andric SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); 899bdd1243dSDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 900bdd1243dSDimitry Andric 901bdd1243dSDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 902bdd1243dSDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 903bdd1243dSDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 904bdd1243dSDimitry Andric } 905bdd1243dSDimitry Andric } 906bdd1243dSDimitry Andric 90706c3fb27SDimitry Andric class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { 90806c3fb27SDimitry Andric private: 90906c3fb27SDimitry Andric // Whether the DS_READ is a predecessor of first four MFMA in region 91006c3fb27SDimitry Andric class EnablesInitialMFMA final : public InstructionRule { 91106c3fb27SDimitry Andric public: 91206c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 91306c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 91406c3fb27SDimitry Andric if (!SyncPipe.size()) 91506c3fb27SDimitry Andric return false; 91606c3fb27SDimitry Andric int MFMAsFound = 0; 91706c3fb27SDimitry Andric if (!Cache->size()) { 91806c3fb27SDimitry Andric for (auto &Elt : SyncPipe[0].DAG->SUnits) { 91906c3fb27SDimitry Andric if (TII->isMFMAorWMMA(*Elt.getInstr())) { 92006c3fb27SDimitry Andric ++MFMAsFound; 92106c3fb27SDimitry Andric if (MFMAsFound > 4) 92206c3fb27SDimitry Andric break; 92306c3fb27SDimitry Andric Cache->push_back(&Elt); 92406c3fb27SDimitry Andric } 92506c3fb27SDimitry Andric } 92606c3fb27SDimitry Andric } 92706c3fb27SDimitry Andric 92806c3fb27SDimitry Andric assert(Cache->size()); 92906c3fb27SDimitry Andric auto DAG = SyncPipe[0].DAG; 93006c3fb27SDimitry Andric for (auto &Elt : *Cache) { 93106c3fb27SDimitry Andric if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU))) 93206c3fb27SDimitry Andric return true; 93306c3fb27SDimitry Andric } 93406c3fb27SDimitry Andric return false; 93506c3fb27SDimitry Andric } 93606c3fb27SDimitry Andric 93706c3fb27SDimitry Andric EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, 93806c3fb27SDimitry Andric bool NeedsCache = false) 93906c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 94006c3fb27SDimitry Andric }; 94106c3fb27SDimitry Andric 94206c3fb27SDimitry Andric // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE 94306c3fb27SDimitry Andric class IsPermForDSW final : public InstructionRule { 94406c3fb27SDimitry Andric public: 94506c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 94606c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 94706c3fb27SDimitry Andric auto MI = SU->getInstr(); 94806c3fb27SDimitry Andric if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) 94906c3fb27SDimitry Andric return false; 95006c3fb27SDimitry Andric 95106c3fb27SDimitry Andric bool FitsInGroup = false; 95206c3fb27SDimitry Andric // Does the VALU have a DS_WRITE successor 95306c3fb27SDimitry Andric if (!Collection.size()) { 95406c3fb27SDimitry Andric for (auto &Succ : SU->Succs) { 95506c3fb27SDimitry Andric SUnit *SuccUnit = Succ.getSUnit(); 95606c3fb27SDimitry Andric if (TII->isDS(*SuccUnit->getInstr()) && 95706c3fb27SDimitry Andric SuccUnit->getInstr()->mayStore()) { 95806c3fb27SDimitry Andric Cache->push_back(SuccUnit); 95906c3fb27SDimitry Andric FitsInGroup = true; 96006c3fb27SDimitry Andric } 96106c3fb27SDimitry Andric } 96206c3fb27SDimitry Andric return FitsInGroup; 96306c3fb27SDimitry Andric } 96406c3fb27SDimitry Andric 96506c3fb27SDimitry Andric assert(Cache->size()); 96606c3fb27SDimitry Andric 96706c3fb27SDimitry Andric // Does the VALU have a DS_WRITE successor that is the same as other 96806c3fb27SDimitry Andric // VALU already in the group. The V_PERMs will all share 1 DS_W succ 9695f757f3fSDimitry Andric return llvm::any_of(*Cache, [&SU](SUnit *Elt) { 9705f757f3fSDimitry Andric return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) { 97106c3fb27SDimitry Andric return ThisSucc.getSUnit() == Elt; 97206c3fb27SDimitry Andric }); 97306c3fb27SDimitry Andric }); 97406c3fb27SDimitry Andric } 97506c3fb27SDimitry Andric 97606c3fb27SDimitry Andric IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 97706c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 97806c3fb27SDimitry Andric }; 97906c3fb27SDimitry Andric 98006c3fb27SDimitry Andric // Whether the SU is a successor of any element in previous SchedGroup 98106c3fb27SDimitry Andric class IsSuccOfPrevGroup final : public InstructionRule { 98206c3fb27SDimitry Andric public: 98306c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 98406c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 98506c3fb27SDimitry Andric SchedGroup *OtherGroup = nullptr; 98606c3fb27SDimitry Andric for (auto &PipeSG : SyncPipe) { 98706c3fb27SDimitry Andric if ((unsigned)PipeSG.getSGID() == SGID - 1) { 98806c3fb27SDimitry Andric OtherGroup = &PipeSG; 98906c3fb27SDimitry Andric } 99006c3fb27SDimitry Andric } 99106c3fb27SDimitry Andric 99206c3fb27SDimitry Andric if (!OtherGroup) 99306c3fb27SDimitry Andric return false; 99406c3fb27SDimitry Andric if (!OtherGroup->Collection.size()) 99506c3fb27SDimitry Andric return true; 99606c3fb27SDimitry Andric 99706c3fb27SDimitry Andric // Does the previous VALU have this DS_Write as a successor 99806c3fb27SDimitry Andric return (std::any_of(OtherGroup->Collection.begin(), 99906c3fb27SDimitry Andric OtherGroup->Collection.end(), [&SU](SUnit *Elt) { 100006c3fb27SDimitry Andric return std::any_of(Elt->Succs.begin(), 100106c3fb27SDimitry Andric Elt->Succs.end(), 100206c3fb27SDimitry Andric [&SU](SDep &Succ) { 100306c3fb27SDimitry Andric return Succ.getSUnit() == SU; 100406c3fb27SDimitry Andric }); 100506c3fb27SDimitry Andric })); 100606c3fb27SDimitry Andric } 100706c3fb27SDimitry Andric IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, 100806c3fb27SDimitry Andric bool NeedsCache = false) 100906c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 101006c3fb27SDimitry Andric }; 101106c3fb27SDimitry Andric 101206c3fb27SDimitry Andric // Whether the combined load width of group is 128 bits 101306c3fb27SDimitry Andric class VMEMSize final : public InstructionRule { 101406c3fb27SDimitry Andric public: 101506c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 101606c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 101706c3fb27SDimitry Andric auto MI = SU->getInstr(); 101806c3fb27SDimitry Andric if (MI->getOpcode() == TargetOpcode::BUNDLE) 101906c3fb27SDimitry Andric return false; 102006c3fb27SDimitry Andric if (!Collection.size()) 102106c3fb27SDimitry Andric return true; 102206c3fb27SDimitry Andric 102306c3fb27SDimitry Andric int NumBits = 0; 102406c3fb27SDimitry Andric 102506c3fb27SDimitry Andric auto TRI = TII->getRegisterInfo(); 102606c3fb27SDimitry Andric auto &MRI = MI->getParent()->getParent()->getRegInfo(); 102706c3fb27SDimitry Andric for (auto &Elt : Collection) { 102806c3fb27SDimitry Andric auto Op = Elt->getInstr()->getOperand(0); 102906c3fb27SDimitry Andric auto Size = 103006c3fb27SDimitry Andric TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); 103106c3fb27SDimitry Andric NumBits += Size; 103206c3fb27SDimitry Andric } 103306c3fb27SDimitry Andric 103406c3fb27SDimitry Andric if (NumBits < 128) { 103506c3fb27SDimitry Andric assert(TII->isVMEM(*MI) && MI->mayLoad()); 103606c3fb27SDimitry Andric if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( 103706c3fb27SDimitry Andric MRI, MI->getOperand(0))) <= 103806c3fb27SDimitry Andric 128) 103906c3fb27SDimitry Andric return true; 104006c3fb27SDimitry Andric } 104106c3fb27SDimitry Andric 104206c3fb27SDimitry Andric return false; 104306c3fb27SDimitry Andric } 104406c3fb27SDimitry Andric 104506c3fb27SDimitry Andric VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 104606c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 104706c3fb27SDimitry Andric }; 104806c3fb27SDimitry Andric 10495f757f3fSDimitry Andric /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 10505f757f3fSDimitry Andric /// that is \p Distance steps away 105106c3fb27SDimitry Andric class SharesPredWithPrevNthGroup final : public InstructionRule { 105206c3fb27SDimitry Andric private: 105306c3fb27SDimitry Andric unsigned Distance = 1; 105406c3fb27SDimitry Andric 105506c3fb27SDimitry Andric public: 105606c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 105706c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 105806c3fb27SDimitry Andric SchedGroup *OtherGroup = nullptr; 105906c3fb27SDimitry Andric if (!SyncPipe.size()) 106006c3fb27SDimitry Andric return false; 106106c3fb27SDimitry Andric 106206c3fb27SDimitry Andric if (!Cache->size()) { 106306c3fb27SDimitry Andric 106406c3fb27SDimitry Andric for (auto &PipeSG : SyncPipe) { 106506c3fb27SDimitry Andric if ((unsigned)PipeSG.getSGID() == SGID - Distance) { 106606c3fb27SDimitry Andric OtherGroup = &PipeSG; 106706c3fb27SDimitry Andric } 106806c3fb27SDimitry Andric } 106906c3fb27SDimitry Andric 107006c3fb27SDimitry Andric if (!OtherGroup) 107106c3fb27SDimitry Andric return false; 107206c3fb27SDimitry Andric if (!OtherGroup->Collection.size()) 107306c3fb27SDimitry Andric return true; 107406c3fb27SDimitry Andric 107506c3fb27SDimitry Andric for (auto &OtherEle : OtherGroup->Collection) { 107606c3fb27SDimitry Andric for (auto &Pred : OtherEle->Preds) { 107706c3fb27SDimitry Andric if (Pred.getSUnit()->getInstr()->getOpcode() == 107806c3fb27SDimitry Andric AMDGPU::V_PERM_B32_e64) 107906c3fb27SDimitry Andric Cache->push_back(Pred.getSUnit()); 108006c3fb27SDimitry Andric } 108106c3fb27SDimitry Andric } 10825f757f3fSDimitry Andric 10835f757f3fSDimitry Andric // If the other group has no PERM preds, then this group won't share any 10845f757f3fSDimitry Andric if (!Cache->size()) 10855f757f3fSDimitry Andric return false; 108606c3fb27SDimitry Andric } 108706c3fb27SDimitry Andric 108806c3fb27SDimitry Andric auto DAG = SyncPipe[0].DAG; 108906c3fb27SDimitry Andric // Does the previous DS_WRITE share a V_PERM predecessor with this 109006c3fb27SDimitry Andric // VMEM_READ 10915f757f3fSDimitry Andric return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) { 109206c3fb27SDimitry Andric return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); 10935f757f3fSDimitry Andric }); 109406c3fb27SDimitry Andric } 109506c3fb27SDimitry Andric SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, 109606c3fb27SDimitry Andric unsigned SGID, bool NeedsCache = false) 109706c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} 109806c3fb27SDimitry Andric }; 109906c3fb27SDimitry Andric 110006c3fb27SDimitry Andric public: 110106c3fb27SDimitry Andric void applyIGLPStrategy( 110206c3fb27SDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 11035f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 11045f757f3fSDimitry Andric bool IsReentry) override; 110506c3fb27SDimitry Andric 110606c3fb27SDimitry Andric bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } 110706c3fb27SDimitry Andric 110806c3fb27SDimitry Andric MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 110906c3fb27SDimitry Andric : IGLPStrategy(DAG, TII) { 111006c3fb27SDimitry Andric IsBottomUp = 0; 111106c3fb27SDimitry Andric } 111206c3fb27SDimitry Andric }; 111306c3fb27SDimitry Andric 11145f757f3fSDimitry Andric static unsigned DSWCount = 0; 11155f757f3fSDimitry Andric static unsigned DSWWithPermCount = 0; 11165f757f3fSDimitry Andric static unsigned DSWWithSharedVMEMCount = 0; 11175f757f3fSDimitry Andric 111806c3fb27SDimitry Andric void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( 111906c3fb27SDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 11205f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 11215f757f3fSDimitry Andric bool IsReentry) { 112206c3fb27SDimitry Andric unsigned MFMACount = 0; 112306c3fb27SDimitry Andric unsigned DSRCount = 0; 11245f757f3fSDimitry Andric 11255f757f3fSDimitry Andric assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 && 11265f757f3fSDimitry Andric DSWWithSharedVMEMCount == 0)) && 11275f757f3fSDimitry Andric "DSWCounters should be zero in pre-RA scheduling!"); 112806c3fb27SDimitry Andric SmallVector<SUnit *, 6> DSWithPerms; 112906c3fb27SDimitry Andric for (auto &SU : DAG->SUnits) { 113006c3fb27SDimitry Andric auto I = SU.getInstr(); 113106c3fb27SDimitry Andric if (TII->isMFMAorWMMA(*I)) 113206c3fb27SDimitry Andric ++MFMACount; 113306c3fb27SDimitry Andric else if (TII->isDS(*I)) { 113406c3fb27SDimitry Andric if (I->mayLoad()) 113506c3fb27SDimitry Andric ++DSRCount; 11365f757f3fSDimitry Andric else if (I->mayStore() && !IsReentry) { 113706c3fb27SDimitry Andric ++DSWCount; 113806c3fb27SDimitry Andric for (auto Pred : SU.Preds) { 113906c3fb27SDimitry Andric if (Pred.getSUnit()->getInstr()->getOpcode() == 114006c3fb27SDimitry Andric AMDGPU::V_PERM_B32_e64) { 114106c3fb27SDimitry Andric DSWithPerms.push_back(&SU); 114206c3fb27SDimitry Andric break; 114306c3fb27SDimitry Andric } 114406c3fb27SDimitry Andric } 114506c3fb27SDimitry Andric } 114606c3fb27SDimitry Andric } 114706c3fb27SDimitry Andric } 11485f757f3fSDimitry Andric 11495f757f3fSDimitry Andric if (!IsReentry) { 115006c3fb27SDimitry Andric DSWWithPermCount = DSWithPerms.size(); 115106c3fb27SDimitry Andric auto I = DSWithPerms.begin(); 115206c3fb27SDimitry Andric auto E = DSWithPerms.end(); 115306c3fb27SDimitry Andric 115406c3fb27SDimitry Andric // Get the count of DS_WRITES with V_PERM predecessors which 115506c3fb27SDimitry Andric // have loop carried dependencies (WAR) on the same VMEM_READs. 115606c3fb27SDimitry Andric // We consider partial overlap as a miss -- in other words, 115706c3fb27SDimitry Andric // for a given DS_W, we only consider another DS_W as matching 115806c3fb27SDimitry Andric // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred 115906c3fb27SDimitry Andric // for every V_PERM pred of this DS_W. 116006c3fb27SDimitry Andric DenseMap<MachineInstr *, SUnit *> VMEMLookup; 116106c3fb27SDimitry Andric SmallVector<SUnit *, 6> Counted; 116206c3fb27SDimitry Andric for (; I != E; I++) { 116306c3fb27SDimitry Andric SUnit *Cand = nullptr; 116406c3fb27SDimitry Andric bool MissedAny = false; 116506c3fb27SDimitry Andric for (auto &Pred : (*I)->Preds) { 116606c3fb27SDimitry Andric if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) 116706c3fb27SDimitry Andric continue; 116806c3fb27SDimitry Andric 11695f757f3fSDimitry Andric if (Cand && llvm::is_contained(Counted, Cand)) 117006c3fb27SDimitry Andric break; 117106c3fb27SDimitry Andric 117206c3fb27SDimitry Andric for (auto &Succ : Pred.getSUnit()->Succs) { 117306c3fb27SDimitry Andric auto MI = Succ.getSUnit()->getInstr(); 117406c3fb27SDimitry Andric if (!TII->isVMEM(*MI) || !MI->mayLoad()) 117506c3fb27SDimitry Andric continue; 117606c3fb27SDimitry Andric 117706c3fb27SDimitry Andric if (MissedAny || !VMEMLookup.size()) { 117806c3fb27SDimitry Andric MissedAny = true; 117906c3fb27SDimitry Andric VMEMLookup[MI] = *I; 118006c3fb27SDimitry Andric continue; 118106c3fb27SDimitry Andric } 118206c3fb27SDimitry Andric 118306c3fb27SDimitry Andric if (!VMEMLookup.contains(MI)) { 118406c3fb27SDimitry Andric MissedAny = true; 118506c3fb27SDimitry Andric VMEMLookup[MI] = *I; 118606c3fb27SDimitry Andric continue; 118706c3fb27SDimitry Andric } 118806c3fb27SDimitry Andric 118906c3fb27SDimitry Andric Cand = VMEMLookup[MI]; 11905f757f3fSDimitry Andric if (llvm::is_contained(Counted, Cand)) { 119106c3fb27SDimitry Andric MissedAny = true; 119206c3fb27SDimitry Andric break; 119306c3fb27SDimitry Andric } 119406c3fb27SDimitry Andric } 119506c3fb27SDimitry Andric } 119606c3fb27SDimitry Andric if (!MissedAny && Cand) { 119706c3fb27SDimitry Andric DSWWithSharedVMEMCount += 2; 119806c3fb27SDimitry Andric Counted.push_back(Cand); 119906c3fb27SDimitry Andric Counted.push_back(*I); 120006c3fb27SDimitry Andric } 120106c3fb27SDimitry Andric } 12025f757f3fSDimitry Andric } 120306c3fb27SDimitry Andric 120406c3fb27SDimitry Andric assert(DSWWithSharedVMEMCount <= DSWWithPermCount); 120506c3fb27SDimitry Andric SchedGroup *SG; 120606c3fb27SDimitry Andric unsigned PipelineSyncID = 0; 120706c3fb27SDimitry Andric // For kernels with V_PERM, there are enough VALU to mix in between MFMAs 120806c3fb27SDimitry Andric if (DSWWithPermCount) { 120906c3fb27SDimitry Andric for (unsigned I = 0; I < MFMACount; I++) { 121006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 121106c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 121206c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 121306c3fb27SDimitry Andric 121406c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 121506c3fb27SDimitry Andric SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); 121606c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 121706c3fb27SDimitry Andric } 121806c3fb27SDimitry Andric } 121906c3fb27SDimitry Andric 122006c3fb27SDimitry Andric PipelineSyncID = 1; 122106c3fb27SDimitry Andric // Phase 1: Break up DS_READ and MFMA clusters. 122206c3fb27SDimitry Andric // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ 122306c3fb27SDimitry Andric // prefetch 122406c3fb27SDimitry Andric 122506c3fb27SDimitry Andric // Make ready initial MFMA 122606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 122706c3fb27SDimitry Andric SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); 122806c3fb27SDimitry Andric SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); 122906c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 123006c3fb27SDimitry Andric 123106c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 123206c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 123306c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 123406c3fb27SDimitry Andric 123506c3fb27SDimitry Andric // Interleave MFMA with DS_READ prefetch 123606c3fb27SDimitry Andric for (unsigned I = 0; I < DSRCount - 4; ++I) { 123706c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 123806c3fb27SDimitry Andric SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); 123906c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 124006c3fb27SDimitry Andric 124106c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 124206c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 124306c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 124406c3fb27SDimitry Andric } 124506c3fb27SDimitry Andric 124606c3fb27SDimitry Andric // Phase 2a: Loop carried dependency with V_PERM 124706c3fb27SDimitry Andric // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 124806c3fb27SDimitry Andric // depend on. Interleave MFMA to keep XDL unit busy throughout. 124906c3fb27SDimitry Andric for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { 125006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 125106c3fb27SDimitry Andric SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 125206c3fb27SDimitry Andric SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 125306c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 125406c3fb27SDimitry Andric 125506c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 125606c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 125706c3fb27SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 125806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 125906c3fb27SDimitry Andric 126006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 126106c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 126206c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 126306c3fb27SDimitry Andric 1, TII, SG->getSGID(), true)); 126406c3fb27SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 126506c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 126606c3fb27SDimitry Andric 126706c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 126806c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 126906c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 127006c3fb27SDimitry Andric 127106c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 127206c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 127306c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 127406c3fb27SDimitry Andric 3, TII, SG->getSGID(), true)); 127506c3fb27SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 127606c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 127706c3fb27SDimitry Andric 127806c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 127906c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 128006c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 128106c3fb27SDimitry Andric } 128206c3fb27SDimitry Andric 128306c3fb27SDimitry Andric // Phase 2b: Loop carried dependency without V_PERM 128406c3fb27SDimitry Andric // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. 128506c3fb27SDimitry Andric // Interleave MFMA to keep XDL unit busy throughout. 128606c3fb27SDimitry Andric for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { 128706c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 128806c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 128906c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 129006c3fb27SDimitry Andric 129106c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 129206c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 129306c3fb27SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 129406c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 129506c3fb27SDimitry Andric 129606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 129706c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 129806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 129906c3fb27SDimitry Andric } 130006c3fb27SDimitry Andric 130106c3fb27SDimitry Andric // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are 130206c3fb27SDimitry Andric // ultimately used by two DS_WRITE 130306c3fb27SDimitry Andric // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 130406c3fb27SDimitry Andric // depend on. Interleave MFMA to keep XDL unit busy throughout. 130506c3fb27SDimitry Andric 130606c3fb27SDimitry Andric for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { 130706c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 130806c3fb27SDimitry Andric SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 130906c3fb27SDimitry Andric SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 131006c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 131106c3fb27SDimitry Andric 131206c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 131306c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 131406c3fb27SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 131506c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 131606c3fb27SDimitry Andric 131706c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 131806c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 131906c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 132006c3fb27SDimitry Andric 132106c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 132206c3fb27SDimitry Andric SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 132306c3fb27SDimitry Andric SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 132406c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 132506c3fb27SDimitry Andric 132606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 132706c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 132806c3fb27SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 132906c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 133006c3fb27SDimitry Andric 133106c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 133206c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 133306c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 133406c3fb27SDimitry Andric 133506c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 133606c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 133706c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 133806c3fb27SDimitry Andric 2, TII, SG->getSGID(), true)); 133906c3fb27SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 134006c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 134106c3fb27SDimitry Andric 134206c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 134306c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 134406c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 134506c3fb27SDimitry Andric 134606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 134706c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 134806c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 134906c3fb27SDimitry Andric 4, TII, SG->getSGID(), true)); 135006c3fb27SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 135106c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 135206c3fb27SDimitry Andric 135306c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 135406c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 135506c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 135606c3fb27SDimitry Andric } 135706c3fb27SDimitry Andric } 135806c3fb27SDimitry Andric 1359bdd1243dSDimitry Andric static std::unique_ptr<IGLPStrategy> 1360bdd1243dSDimitry Andric createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, 1361bdd1243dSDimitry Andric const SIInstrInfo *TII) { 1362bdd1243dSDimitry Andric switch (ID) { 1363bdd1243dSDimitry Andric case MFMASmallGemmOptID: 1364bdd1243dSDimitry Andric return std::make_unique<MFMASmallGemmOpt>(DAG, TII); 136506c3fb27SDimitry Andric case MFMASmallGemmSingleWaveOptID: 136606c3fb27SDimitry Andric return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII); 1367bdd1243dSDimitry Andric } 1368bdd1243dSDimitry Andric 1369bdd1243dSDimitry Andric llvm_unreachable("Unknown IGLPStrategyID"); 1370bdd1243dSDimitry Andric } 1371bdd1243dSDimitry Andric 1372bdd1243dSDimitry Andric class IGroupLPDAGMutation : public ScheduleDAGMutation { 1373bdd1243dSDimitry Andric private: 1374bdd1243dSDimitry Andric const SIInstrInfo *TII; 1375bdd1243dSDimitry Andric 1376bdd1243dSDimitry Andric ScheduleDAGMI *DAG; 1377bdd1243dSDimitry Andric 1378bdd1243dSDimitry Andric // Organize lists of SchedGroups by their SyncID. SchedGroups / 1379bdd1243dSDimitry Andric // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added 1380bdd1243dSDimitry Andric // between then. 1381bdd1243dSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; 1382bdd1243dSDimitry Andric 1383bdd1243dSDimitry Andric // Used to track instructions that can be mapped to multiple sched groups 1384bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; 1385bdd1243dSDimitry Andric 1386bdd1243dSDimitry Andric // Add DAG edges that enforce SCHED_BARRIER ordering. 1387bdd1243dSDimitry Andric void addSchedBarrierEdges(SUnit &SU); 1388bdd1243dSDimitry Andric 1389bdd1243dSDimitry Andric // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should 1390bdd1243dSDimitry Andric // not be reordered accross the SCHED_BARRIER. This is used for the base 1391bdd1243dSDimitry Andric // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that 1392bdd1243dSDimitry Andric // SCHED_BARRIER will always block all instructions that can be classified 1393bdd1243dSDimitry Andric // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size 1394bdd1243dSDimitry Andric // and may only synchronize with some SchedGroups. Returns the inverse of 1395bdd1243dSDimitry Andric // Mask. SCHED_BARRIER's mask describes which instruction types should be 1396bdd1243dSDimitry Andric // allowed to be scheduled across it. Invert the mask to get the 1397bdd1243dSDimitry Andric // SchedGroupMask of instructions that should be barred. 1398bdd1243dSDimitry Andric SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; 1399bdd1243dSDimitry Andric 1400bdd1243dSDimitry Andric // Create SchedGroups for a SCHED_GROUP_BARRIER. 1401bdd1243dSDimitry Andric void initSchedGroupBarrierPipelineStage( 1402bdd1243dSDimitry Andric std::vector<SUnit>::reverse_iterator RIter); 1403bdd1243dSDimitry Andric 1404bdd1243dSDimitry Andric void initIGLPOpt(SUnit &SU); 1405bdd1243dSDimitry Andric 1406bdd1243dSDimitry Andric public: 1407bdd1243dSDimitry Andric void apply(ScheduleDAGInstrs *DAGInstrs) override; 1408bdd1243dSDimitry Andric 140906c3fb27SDimitry Andric // The order in which the PipelineSolver should process the candidate 141006c3fb27SDimitry Andric // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last 141106c3fb27SDimitry Andric // created SchedGroup first, and will consider that as the ultimate 141206c3fb27SDimitry Andric // predecessor group when linking. TOP_DOWN instead links and processes the 141306c3fb27SDimitry Andric // first created SchedGroup first. 141406c3fb27SDimitry Andric bool IsBottomUp = 1; 141506c3fb27SDimitry Andric 14165f757f3fSDimitry Andric // Whether or not this is a reentry into the IGroupLPDAGMutation. 14175f757f3fSDimitry Andric bool IsReentry = false; 14185f757f3fSDimitry Andric 1419bdd1243dSDimitry Andric IGroupLPDAGMutation() = default; 14205f757f3fSDimitry Andric IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {} 1421bdd1243dSDimitry Andric }; 1422bdd1243dSDimitry Andric 1423bdd1243dSDimitry Andric unsigned SchedGroup::NumSchedGroups = 0; 1424bdd1243dSDimitry Andric 1425bdd1243dSDimitry Andric bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { 1426bdd1243dSDimitry Andric if (A != B && DAG->canAddEdge(B, A)) { 1427bdd1243dSDimitry Andric DAG->addEdge(B, SDep(A, SDep::Artificial)); 1428bdd1243dSDimitry Andric return true; 1429bdd1243dSDimitry Andric } 1430bdd1243dSDimitry Andric return false; 1431bdd1243dSDimitry Andric } 1432bdd1243dSDimitry Andric 1433bdd1243dSDimitry Andric bool SchedGroup::canAddMI(const MachineInstr &MI) const { 1434bdd1243dSDimitry Andric bool Result = false; 1435bdd1243dSDimitry Andric if (MI.isMetaInstruction()) 1436bdd1243dSDimitry Andric Result = false; 1437bdd1243dSDimitry Andric 1438bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && 1439*cb14a3feSDimitry Andric (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || 1440*cb14a3feSDimitry Andric TII->isTRANS(MI))) 1441bdd1243dSDimitry Andric Result = true; 1442bdd1243dSDimitry Andric 1443bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && 1444*cb14a3feSDimitry Andric TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) 1445bdd1243dSDimitry Andric Result = true; 1446bdd1243dSDimitry Andric 1447bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && 1448bdd1243dSDimitry Andric TII->isSALU(MI)) 1449bdd1243dSDimitry Andric Result = true; 1450bdd1243dSDimitry Andric 1451bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && 1452bdd1243dSDimitry Andric TII->isMFMAorWMMA(MI)) 1453bdd1243dSDimitry Andric Result = true; 1454bdd1243dSDimitry Andric 1455bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && 1456bdd1243dSDimitry Andric (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 1457bdd1243dSDimitry Andric Result = true; 1458bdd1243dSDimitry Andric 1459bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && 1460bdd1243dSDimitry Andric MI.mayLoad() && 1461bdd1243dSDimitry Andric (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 1462bdd1243dSDimitry Andric Result = true; 1463bdd1243dSDimitry Andric 1464bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && 1465bdd1243dSDimitry Andric MI.mayStore() && 1466bdd1243dSDimitry Andric (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 1467bdd1243dSDimitry Andric Result = true; 1468bdd1243dSDimitry Andric 1469bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && 1470bdd1243dSDimitry Andric TII->isDS(MI)) 1471bdd1243dSDimitry Andric Result = true; 1472bdd1243dSDimitry Andric 1473bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && 1474bdd1243dSDimitry Andric MI.mayLoad() && TII->isDS(MI)) 1475bdd1243dSDimitry Andric Result = true; 1476bdd1243dSDimitry Andric 1477bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && 1478bdd1243dSDimitry Andric MI.mayStore() && TII->isDS(MI)) 1479bdd1243dSDimitry Andric Result = true; 1480bdd1243dSDimitry Andric 1481*cb14a3feSDimitry Andric else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) && 1482*cb14a3feSDimitry Andric TII->isTRANS(MI)) 1483*cb14a3feSDimitry Andric Result = true; 1484*cb14a3feSDimitry Andric 1485bdd1243dSDimitry Andric LLVM_DEBUG( 1486bdd1243dSDimitry Andric dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) 1487bdd1243dSDimitry Andric << (Result ? " could classify " : " unable to classify ") << MI); 1488bdd1243dSDimitry Andric 1489bdd1243dSDimitry Andric return Result; 1490bdd1243dSDimitry Andric } 1491bdd1243dSDimitry Andric 1492bdd1243dSDimitry Andric int SchedGroup::link(SUnit &SU, bool MakePred, 1493bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 1494bdd1243dSDimitry Andric int MissedEdges = 0; 1495bdd1243dSDimitry Andric for (auto *A : Collection) { 1496bdd1243dSDimitry Andric SUnit *B = &SU; 1497bdd1243dSDimitry Andric if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 1498bdd1243dSDimitry Andric continue; 1499bdd1243dSDimitry Andric if (MakePred) 1500bdd1243dSDimitry Andric std::swap(A, B); 1501bdd1243dSDimitry Andric 1502bdd1243dSDimitry Andric if (DAG->IsReachable(B, A)) 1503bdd1243dSDimitry Andric continue; 150406c3fb27SDimitry Andric 1505bdd1243dSDimitry Andric // tryAddEdge returns false if there is a dependency that makes adding 1506bdd1243dSDimitry Andric // the A->B edge impossible, otherwise it returns true; 1507bdd1243dSDimitry Andric bool Added = tryAddEdge(A, B); 1508bdd1243dSDimitry Andric if (Added) 1509bdd1243dSDimitry Andric AddedEdges.push_back(std::pair(A, B)); 1510bdd1243dSDimitry Andric else 1511bdd1243dSDimitry Andric ++MissedEdges; 1512bdd1243dSDimitry Andric } 1513bdd1243dSDimitry Andric 1514bdd1243dSDimitry Andric return MissedEdges; 1515bdd1243dSDimitry Andric } 1516bdd1243dSDimitry Andric 1517bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU, bool MakePred) { 1518bdd1243dSDimitry Andric for (auto *A : Collection) { 1519bdd1243dSDimitry Andric SUnit *B = &SU; 1520bdd1243dSDimitry Andric if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 1521bdd1243dSDimitry Andric continue; 1522bdd1243dSDimitry Andric if (MakePred) 1523bdd1243dSDimitry Andric std::swap(A, B); 1524bdd1243dSDimitry Andric 1525bdd1243dSDimitry Andric tryAddEdge(A, B); 1526bdd1243dSDimitry Andric } 1527bdd1243dSDimitry Andric } 1528bdd1243dSDimitry Andric 1529bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU, 1530bdd1243dSDimitry Andric function_ref<bool(const SUnit *A, const SUnit *B)> P) { 1531bdd1243dSDimitry Andric for (auto *A : Collection) { 1532bdd1243dSDimitry Andric SUnit *B = &SU; 1533bdd1243dSDimitry Andric if (P(A, B)) 1534bdd1243dSDimitry Andric std::swap(A, B); 1535bdd1243dSDimitry Andric 1536bdd1243dSDimitry Andric tryAddEdge(A, B); 1537bdd1243dSDimitry Andric } 1538bdd1243dSDimitry Andric } 1539bdd1243dSDimitry Andric 1540bdd1243dSDimitry Andric void SchedGroup::link(SchedGroup &OtherGroup) { 1541bdd1243dSDimitry Andric for (auto *B : OtherGroup.Collection) 1542bdd1243dSDimitry Andric link(*B); 1543bdd1243dSDimitry Andric } 1544bdd1243dSDimitry Andric 1545bdd1243dSDimitry Andric bool SchedGroup::canAddSU(SUnit &SU) const { 1546bdd1243dSDimitry Andric MachineInstr &MI = *SU.getInstr(); 1547bdd1243dSDimitry Andric if (MI.getOpcode() != TargetOpcode::BUNDLE) 1548bdd1243dSDimitry Andric return canAddMI(MI); 1549bdd1243dSDimitry Andric 1550bdd1243dSDimitry Andric // Special case for bundled MIs. 1551bdd1243dSDimitry Andric const MachineBasicBlock *MBB = MI.getParent(); 1552bdd1243dSDimitry Andric MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; 1553bdd1243dSDimitry Andric while (E != MBB->end() && E->isBundledWithPred()) 1554bdd1243dSDimitry Andric ++E; 1555bdd1243dSDimitry Andric 1556bdd1243dSDimitry Andric // Return true if all of the bundled MIs can be added to this group. 1557bdd1243dSDimitry Andric return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); 1558bdd1243dSDimitry Andric } 1559bdd1243dSDimitry Andric 1560bdd1243dSDimitry Andric void SchedGroup::initSchedGroup() { 1561bdd1243dSDimitry Andric for (auto &SU : DAG->SUnits) { 1562bdd1243dSDimitry Andric if (isFull()) 1563bdd1243dSDimitry Andric break; 1564bdd1243dSDimitry Andric 1565bdd1243dSDimitry Andric if (canAddSU(SU)) 1566bdd1243dSDimitry Andric add(SU); 1567bdd1243dSDimitry Andric } 1568bdd1243dSDimitry Andric } 1569bdd1243dSDimitry Andric 1570bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 1571bdd1243dSDimitry Andric SUnitsToCandidateSGsMap &SyncedInstrs) { 1572bdd1243dSDimitry Andric SUnit &InitSU = *RIter; 1573bdd1243dSDimitry Andric for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { 1574bdd1243dSDimitry Andric auto &SU = *RIter; 1575bdd1243dSDimitry Andric if (isFull()) 1576bdd1243dSDimitry Andric break; 1577bdd1243dSDimitry Andric 1578bdd1243dSDimitry Andric if (canAddSU(SU)) 1579bdd1243dSDimitry Andric SyncedInstrs[&SU].push_back(SGID); 1580bdd1243dSDimitry Andric } 1581bdd1243dSDimitry Andric 1582bdd1243dSDimitry Andric add(InitSU); 1583bdd1243dSDimitry Andric assert(MaxSize); 1584bdd1243dSDimitry Andric (*MaxSize)++; 1585bdd1243dSDimitry Andric } 1586bdd1243dSDimitry Andric 1587bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { 1588bdd1243dSDimitry Andric auto I = DAG->SUnits.rbegin(); 1589bdd1243dSDimitry Andric auto E = DAG->SUnits.rend(); 1590bdd1243dSDimitry Andric for (; I != E; ++I) { 1591bdd1243dSDimitry Andric auto &SU = *I; 1592bdd1243dSDimitry Andric if (isFull()) 1593bdd1243dSDimitry Andric break; 1594bdd1243dSDimitry Andric 1595bdd1243dSDimitry Andric if (canAddSU(SU)) 1596bdd1243dSDimitry Andric SyncedInstrs[&SU].push_back(SGID); 1597bdd1243dSDimitry Andric } 1598bdd1243dSDimitry Andric } 1599bdd1243dSDimitry Andric 1600bdd1243dSDimitry Andric void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { 160181ad6265SDimitry Andric const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); 160281ad6265SDimitry Andric if (!TSchedModel || DAGInstrs->SUnits.empty()) 160381ad6265SDimitry Andric return; 160481ad6265SDimitry Andric 1605bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); 160681ad6265SDimitry Andric const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); 160781ad6265SDimitry Andric TII = ST.getInstrInfo(); 160881ad6265SDimitry Andric DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); 1609bdd1243dSDimitry Andric SyncedSchedGroups.clear(); 1610bdd1243dSDimitry Andric SyncedInstrs.clear(); 1611bdd1243dSDimitry Andric bool foundSB = false; 1612bdd1243dSDimitry Andric bool foundIGLP = false; 1613bdd1243dSDimitry Andric for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { 1614bdd1243dSDimitry Andric unsigned Opc = R->getInstr()->getOpcode(); 1615bdd1243dSDimitry Andric // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. 1616bdd1243dSDimitry Andric if (Opc == AMDGPU::SCHED_BARRIER) { 1617bdd1243dSDimitry Andric addSchedBarrierEdges(*R); 1618bdd1243dSDimitry Andric foundSB = true; 1619bdd1243dSDimitry Andric } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { 1620bdd1243dSDimitry Andric initSchedGroupBarrierPipelineStage(R); 1621bdd1243dSDimitry Andric foundSB = true; 1622bdd1243dSDimitry Andric } else if (Opc == AMDGPU::IGLP_OPT) { 1623bdd1243dSDimitry Andric resetEdges(*R, DAG); 1624bdd1243dSDimitry Andric if (!foundSB && !foundIGLP) 1625bdd1243dSDimitry Andric initIGLPOpt(*R); 1626bdd1243dSDimitry Andric foundIGLP = true; 1627bdd1243dSDimitry Andric } 162881ad6265SDimitry Andric } 162981ad6265SDimitry Andric 1630bdd1243dSDimitry Andric if (foundSB || foundIGLP) { 163106c3fb27SDimitry Andric PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); 1632bdd1243dSDimitry Andric // PipelineSolver performs the mutation by adding the edges it 1633bdd1243dSDimitry Andric // determined as the best 1634bdd1243dSDimitry Andric PS.solve(); 1635bdd1243dSDimitry Andric } 1636bdd1243dSDimitry Andric } 1637bdd1243dSDimitry Andric 1638bdd1243dSDimitry Andric void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { 163981ad6265SDimitry Andric MachineInstr &MI = *SchedBarrier.getInstr(); 164081ad6265SDimitry Andric assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); 164181ad6265SDimitry Andric // Remove all existing edges from the SCHED_BARRIER that were added due to the 164281ad6265SDimitry Andric // instruction having side effects. 1643bdd1243dSDimitry Andric resetEdges(SchedBarrier, DAG); 1644*cb14a3feSDimitry Andric LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: " 1645*cb14a3feSDimitry Andric << MI.getOperand(0).getImm() << "\n"); 1646bdd1243dSDimitry Andric auto InvertedMask = 1647bdd1243dSDimitry Andric invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); 1648bdd1243dSDimitry Andric SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); 1649bdd1243dSDimitry Andric SG.initSchedGroup(); 1650*cb14a3feSDimitry Andric 1651bdd1243dSDimitry Andric // Preserve original instruction ordering relative to the SCHED_BARRIER. 1652bdd1243dSDimitry Andric SG.link( 1653bdd1243dSDimitry Andric SchedBarrier, 1654bdd1243dSDimitry Andric (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( 1655bdd1243dSDimitry Andric const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); 165681ad6265SDimitry Andric } 165781ad6265SDimitry Andric 1658bdd1243dSDimitry Andric SchedGroupMask 1659bdd1243dSDimitry Andric IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { 1660bdd1243dSDimitry Andric // Invert mask and erase bits for types of instructions that are implied to be 1661bdd1243dSDimitry Andric // allowed past the SCHED_BARRIER. 1662bdd1243dSDimitry Andric SchedGroupMask InvertedMask = ~Mask; 1663bdd1243dSDimitry Andric 1664*cb14a3feSDimitry Andric // ALU implies VALU, SALU, MFMA, TRANS. 1665bdd1243dSDimitry Andric if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) 1666*cb14a3feSDimitry Andric InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & 1667*cb14a3feSDimitry Andric ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS; 1668*cb14a3feSDimitry Andric // VALU, SALU, MFMA, TRANS implies ALU. 1669bdd1243dSDimitry Andric else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || 1670bdd1243dSDimitry Andric (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || 1671*cb14a3feSDimitry Andric (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE || 1672*cb14a3feSDimitry Andric (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE) 1673bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::ALU; 1674bdd1243dSDimitry Andric 1675bdd1243dSDimitry Andric // VMEM implies VMEM_READ, VMEM_WRITE. 1676bdd1243dSDimitry Andric if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) 1677bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; 1678bdd1243dSDimitry Andric // VMEM_READ, VMEM_WRITE implies VMEM. 1679bdd1243dSDimitry Andric else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || 1680bdd1243dSDimitry Andric (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) 1681bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::VMEM; 1682bdd1243dSDimitry Andric 1683bdd1243dSDimitry Andric // DS implies DS_READ, DS_WRITE. 1684bdd1243dSDimitry Andric if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) 1685bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; 1686bdd1243dSDimitry Andric // DS_READ, DS_WRITE implies DS. 1687bdd1243dSDimitry Andric else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || 1688bdd1243dSDimitry Andric (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) 1689bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::DS; 1690bdd1243dSDimitry Andric 1691*cb14a3feSDimitry Andric LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask 1692*cb14a3feSDimitry Andric << "\n"); 1693*cb14a3feSDimitry Andric 1694bdd1243dSDimitry Andric return InvertedMask; 169581ad6265SDimitry Andric } 169681ad6265SDimitry Andric 1697bdd1243dSDimitry Andric void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( 1698bdd1243dSDimitry Andric std::vector<SUnit>::reverse_iterator RIter) { 1699bdd1243dSDimitry Andric // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due 1700bdd1243dSDimitry Andric // to the instruction having side effects. 1701bdd1243dSDimitry Andric resetEdges(*RIter, DAG); 1702bdd1243dSDimitry Andric MachineInstr &SGB = *RIter->getInstr(); 1703bdd1243dSDimitry Andric assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); 1704bdd1243dSDimitry Andric int32_t SGMask = SGB.getOperand(0).getImm(); 1705bdd1243dSDimitry Andric int32_t Size = SGB.getOperand(1).getImm(); 1706bdd1243dSDimitry Andric int32_t SyncID = SGB.getOperand(2).getImm(); 1707bdd1243dSDimitry Andric 1708bdd1243dSDimitry Andric auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, 1709bdd1243dSDimitry Andric Size, SyncID, DAG, TII); 1710bdd1243dSDimitry Andric 1711bdd1243dSDimitry Andric SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); 171281ad6265SDimitry Andric } 171381ad6265SDimitry Andric 1714bdd1243dSDimitry Andric void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { 1715bdd1243dSDimitry Andric IGLPStrategyID StrategyID = 1716bdd1243dSDimitry Andric (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); 1717bdd1243dSDimitry Andric auto S = createIGLPStrategy(StrategyID, DAG, TII); 171806c3fb27SDimitry Andric if (S->shouldApplyStrategy(DAG)) { 171906c3fb27SDimitry Andric IsBottomUp = S->IsBottomUp; 17205f757f3fSDimitry Andric S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry); 172181ad6265SDimitry Andric } 172206c3fb27SDimitry Andric } 172381ad6265SDimitry Andric 172481ad6265SDimitry Andric } // namespace 172581ad6265SDimitry Andric 172681ad6265SDimitry Andric namespace llvm { 172781ad6265SDimitry Andric 17285f757f3fSDimitry Andric /// \p IsReentry specifes whether or not this is a reentry into the 17295f757f3fSDimitry Andric /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the 17305f757f3fSDimitry Andric /// same scheduling region (e.g. pre and post-RA scheduling / multiple 17315f757f3fSDimitry Andric /// scheduling "phases"), we can reenter this mutation framework more than once 17325f757f3fSDimitry Andric /// for a given region. 17335f757f3fSDimitry Andric std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) { 17345f757f3fSDimitry Andric return std::make_unique<IGroupLPDAGMutation>(IsReentry); 173581ad6265SDimitry Andric } 173681ad6265SDimitry Andric 173781ad6265SDimitry Andric } // end namespace llvm 1738