181ad6265SDimitry Andric //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// 281ad6265SDimitry Andric // 381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 681ad6265SDimitry Andric // 781ad6265SDimitry Andric //===----------------------------------------------------------------------===// 881ad6265SDimitry Andric // 981ad6265SDimitry Andric // \file This file defines a set of schedule DAG mutations that can be used to 1081ad6265SDimitry Andric // override default scheduler behavior to enforce specific scheduling patterns. 1181ad6265SDimitry Andric // They should be used in cases where runtime performance considerations such as 1281ad6265SDimitry Andric // inter-wavefront interactions, mean that compile-time heuristics cannot 1381ad6265SDimitry Andric // predict the optimal instruction ordering, or in kernels where optimum 1481ad6265SDimitry Andric // instruction scheduling is important enough to warrant manual intervention. 
1581ad6265SDimitry Andric // 1681ad6265SDimitry Andric //===----------------------------------------------------------------------===// 1781ad6265SDimitry Andric 1881ad6265SDimitry Andric #include "AMDGPUIGroupLP.h" 1981ad6265SDimitry Andric #include "AMDGPUTargetMachine.h" 2081ad6265SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 2181ad6265SDimitry Andric #include "SIInstrInfo.h" 2281ad6265SDimitry Andric #include "SIMachineFunctionInfo.h" 2381ad6265SDimitry Andric #include "llvm/ADT/BitmaskEnum.h" 24bdd1243dSDimitry Andric #include "llvm/ADT/DenseMap.h" 2581ad6265SDimitry Andric #include "llvm/CodeGen/MachineScheduler.h" 2681ad6265SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h" 2781ad6265SDimitry Andric 2881ad6265SDimitry Andric using namespace llvm; 2981ad6265SDimitry Andric 30bdd1243dSDimitry Andric #define DEBUG_TYPE "igrouplp" 3181ad6265SDimitry Andric 3281ad6265SDimitry Andric namespace { 3381ad6265SDimitry Andric 34bdd1243dSDimitry Andric static cl::opt<bool> EnableExactSolver( 35bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver", cl::Hidden, 36bdd1243dSDimitry Andric cl::desc("Whether to use the exponential time solver to fit " 37bdd1243dSDimitry Andric "the instructions to the pipeline as closely as " 38bdd1243dSDimitry Andric "possible."), 3981ad6265SDimitry Andric cl::init(false)); 4081ad6265SDimitry Andric 41bdd1243dSDimitry Andric static cl::opt<unsigned> CutoffForExact( 42bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden, 43bdd1243dSDimitry Andric cl::desc("The maximum number of scheduling group conflicts " 44bdd1243dSDimitry Andric "which we attempt to solve with the exponential time " 45bdd1243dSDimitry Andric "exact solver. Problem sizes greater than this will" 46bdd1243dSDimitry Andric "be solved by the less accurate greedy algorithm. Selecting " 47bdd1243dSDimitry Andric "solver by size is superseded by manually selecting " 48bdd1243dSDimitry Andric "the solver (e.g. 
by amdgpu-igrouplp-exact-solver")); 4981ad6265SDimitry Andric 50bdd1243dSDimitry Andric static cl::opt<uint64_t> MaxBranchesExplored( 51bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden, 52bdd1243dSDimitry Andric cl::desc("The amount of branches that we are willing to explore with" 53bdd1243dSDimitry Andric "the exact algorithm before giving up.")); 5481ad6265SDimitry Andric 55bdd1243dSDimitry Andric static cl::opt<bool> UseCostHeur( 56bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden, 57bdd1243dSDimitry Andric cl::desc("Whether to use the cost heuristic to make choices as we " 58bdd1243dSDimitry Andric "traverse the search space using the exact solver. Defaulted " 59bdd1243dSDimitry Andric "to on, and if turned off, we will use the node order -- " 60bdd1243dSDimitry Andric "attempting to put the later nodes in the later sched groups. " 61bdd1243dSDimitry Andric "Experimentally, results are mixed, so this should be set on a " 62bdd1243dSDimitry Andric "case-by-case basis.")); 6381ad6265SDimitry Andric 64bdd1243dSDimitry Andric // Components of the mask that determines which instruction types may be may be 65bdd1243dSDimitry Andric // classified into a SchedGroup. 
// Bit positions correspond to the mask operand of SCHED_BARRIER /
// SCHED_GROUP_BARRIER (see SchedGroup::SGMask below). Combinations are legal
// via the bitmask-enum operators enabled by LLVM_MARK_AS_BITMASK_ENUM.
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,
  VALU = 1u << 1,
  SALU = 1u << 2,
  MFMA = 1u << 3,
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  DS = 1u << 7,
  DS_READ = 1u << 8,
  DS_WRITE = 1u << 9,
  // Union of every category bit; also names the largest flag for the
  // bitmask-enum machinery below.
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

// Forward declaration: InstructionRule::apply below takes SchedGroups.
class SchedGroup;

// InstructionRule class is used to enact a filter which determines whether or
// not an SU maps to a given SchedGroup. It contains complementary data
// structures (e.g. Cache) to help those filters.
// Base class for per-SchedGroup instruction filters. The default apply()
// accepts every SU; subclasses override it to impose constraints.
class InstructionRule {
protected:
  const SIInstrInfo *TII;
  // ID of the SchedGroup this rule is attached to.
  unsigned SGID;
  // A cache made available to the Filter to store SUnits for subsequent
  // invocations of the Filter
  std::optional<SmallVector<SUnit *, 4>> Cache;

public:
  // Decide whether the SU may join the group given the group's current
  // Collection and the groups in the same sync pipeline. Default: accept.
  virtual bool
  apply(const SUnit *, const ArrayRef<SUnit *>,
        SmallVectorImpl<SchedGroup> &) {
    return true;
  };

  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
                  bool NeedsCache = false)
      : TII(TII), SGID(SGID) {
    // Only materialize the cache when the concrete rule asks for it.
    if (NeedsCache) {
      Cache = SmallVector<SUnit *, 4>();
    }
  }

  virtual ~InstructionRule() = default;
};

// Maps an SU to the list of candidate SchedGroup IDs it could belong to.
typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;

// Classify instructions into groups to enable fine tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Mask that defines which instruction types can be classified into this
  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
  // and SCHED_GROUP_BARRIER.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the same
  // SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups
  unsigned SGID;

  // The different rules each instruction in this SchedGroup must conform to
  SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;

  // Count of the number of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  const SIInstrInfo *TII;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies and track which edges are added, and the count of
  // missed edges
  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Append a constraint that SUs must meet in order to fit into this
  // SchedGroup. Since many rules involve the relationship between a SchedGroup
  // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
  // time (rather than SchedGroup init time.)
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
    Rules.push_back(NewRule);
  }

  // Returns true if the SU matches all rules
  bool allowedByRules(const SUnit *SU,
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
    if (Rules.empty())
      return true;
    // All rules must accept the SU; the first rejection wins.
    for (size_t I = 0; I < Rules.size(); I++) {
      auto TheRule = Rules[I].get();
      if (!TheRule->apply(SU, Collection, SyncPipe)) {
        return false;
      }
    }
    return true;
  }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove last element in the SchedGroup
  void pop() { Collection.pop_back(); }

  // Identify and add all relevant SUs from the DAG to this SchedGroup.
  void initSchedGroup();

  // Add instructions to the SchedGroup bottom up starting from RIter.
  // PipelineInstrs is a set of instructions that should not be added to the
  // SchedGroup even when the other conditions for adding it are satisfied.
  // RIter will be added to the SchedGroup as well, and dependencies will be
  // added so that RIter will always be scheduled at the end of the group.
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
    // SGIDs are allocated from a process-wide counter so every group gets a
    // unique ID.
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
    SGID = NumSchedGroups++;
  }
};

// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
248bdd1243dSDimitry Andric static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { 249bdd1243dSDimitry Andric assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || 250bdd1243dSDimitry Andric SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || 251bdd1243dSDimitry Andric SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT); 252bdd1243dSDimitry Andric 253bdd1243dSDimitry Andric while (!SU.Preds.empty()) 254bdd1243dSDimitry Andric for (auto &P : SU.Preds) 255bdd1243dSDimitry Andric SU.removePred(P); 256bdd1243dSDimitry Andric 257bdd1243dSDimitry Andric while (!SU.Succs.empty()) 258bdd1243dSDimitry Andric for (auto &S : SU.Succs) 259bdd1243dSDimitry Andric for (auto &SP : S.getSUnit()->Preds) 260bdd1243dSDimitry Andric if (SP.getSUnit() == &SU) 261bdd1243dSDimitry Andric S.getSUnit()->removePred(SP); 262bdd1243dSDimitry Andric } 263bdd1243dSDimitry Andric 264bdd1243dSDimitry Andric typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair; 265bdd1243dSDimitry Andric typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec; 266bdd1243dSDimitry Andric 267bdd1243dSDimitry Andric // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline 268bdd1243dSDimitry Andric // in non-trivial cases. For example, if the requested pipeline is 269bdd1243dSDimitry Andric // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction 270bdd1243dSDimitry Andric // in the DAG, then we will have an instruction that can not be trivially 271bdd1243dSDimitry Andric // assigned to a SchedGroup. The PipelineSolver class implements two algorithms 272bdd1243dSDimitry Andric // to find a good solution to the pipeline -- a greedy algorithm and an exact 273bdd1243dSDimitry Andric // algorithm. The exact algorithm has an exponential time complexity and should 274bdd1243dSDimitry Andric // only be used for small sized problems or medium sized problems where an exact 275bdd1243dSDimitry Andric // solution is highly desired. 
class PipelineSolver {
  ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
  // Array form of SyncedInstrs (see convertSyncMapsToArrays), one entry per
  // sync pipeline.
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
  // The current working pipeline
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
  // The pipeline that has the best solution found so far
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
  unsigned computeProblemSize();

  // The cost penalty of not assigning a SU to a SchedGroup
  int MissPenalty = 0;

  // Costs in terms of the number of edges we are unable to add
  int BestCost = -1; // -1 means no solution recorded yet.
  int CurrCost = 0;

  // Index pointing to the conflicting instruction that is currently being
  // fitted
  int CurrConflInstNo = 0;
  // Index to the pipeline that is currently being fitted
  int CurrSyncGroupIdx = 0;
  // The first non trivial pipeline
  int BeginSyncGroupIdx = 0;

  // How many branches we have explored
  uint64_t BranchesExplored = 0;

  // The direction in which we process the candidate SchedGroups per SU
  // NOTE(review): literal 1 used for a bool; 'true' would read clearer.
  bool IsBottomUp = 1;

  // Update indices to fit next conflicting instruction
  void advancePosition();
  // Recede indices to attempt to find better fit for previous conflicting
  // instruction
  void retreatPosition();

  // The exponential time algorithm which finds the provably best fit
  bool solveExact();
  // The polynomial time algorithm which attempts to find a good fit
  bool solveGreedy();
  // Find the best SchedGroup for the current SU using the heuristic given all
  // current information. One step in the greedy algorithm. Templated against
  // the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
                  T E);
  // Whether or not the current solution is optimal
  bool checkOptimal();
  // Populate the ready list, prioritizing fewest missed edges first.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
                         T E);
  // Add edges corresponding to the SchedGroups as assigned by solver
  void makePipeline();
  // Link the SchedGroups in the best found pipeline.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T> void linkSchedGroups(T I, T E);
  // Add the edges from the SU to the other SchedGroups in pipeline, and
  // return the number of edges missed.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
  /// returns the cost (in terms of missed pipeline edges), and tracks the edges
  /// added in \p AddedEdges
  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
  /// Remove the edges passed via \p AddedEdges
  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Convert the passed in maps to arrays for bidirectional iterators
  void convertSyncMapsToArrays();

  // Restore the working state so another fitting attempt can start over.
  void reset();

public:
  // Invoke the solver to map instructions to instruction groups. Heuristic &&
  // command-line-option determines to use exact or greedy algorithm.
  void solve();

  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
                 ScheduleDAGMI *DAG, bool IsBottomUp = 1)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    // There is only work to do if some sync pipeline has at least one
    // instruction with multiple candidate groups.
    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    // Skip over leading trivial (empty) pipelines.
    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};

void PipelineSolver::reset() {

  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {
      SmallVector<SUnit *, 32> TempCollection = SG.Collection;
SG.Collection.clear();
      // Drop everything from the group except its SCHED_GROUP_BARRIER SUnit
      // (if present).
      auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
      });
      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);
    }
  }

  // Rewind the cursors and cost for a fresh fitting attempt.
  CurrSyncGroupIdx = BeginSyncGroupIdx;
  CurrConflInstNo = 0;
  CurrCost = 0;
}

// Convert the SyncID-keyed maps into the parallel vector representation
// (BestPipeline / PipelineInstrs) that the solver iterates bidirectionally.
void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
  }

  // Pipelines are filled back-to-front to mirror the front-insertion above.
  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
        continue;
      }
      auto SortPosition = PipelineInstrs[PipelineIDx].begin();
      // Insert them in sorted order -- this allows for good parsing order in
      // the greedy algorithm
      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
        ++SortPosition;
      PipelineInstrs[PipelineIDx].insert(
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
    }
    --PipelineIDx;
  }
}

// Pairwise-link every SchedGroup in [I, E): each group is ordered before all
// groups that follow it in the iteration order.
template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
  for (; I != E; ++I) {
    auto &GroupA = *I;
    for (auto J = std::next(I); J != E; ++J) {
      auto &GroupB = *J;
      GroupA.link(GroupB);
    }
  }
}

void PipelineSolver::makePipeline() {
  // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
  for (auto &SyncPipeline : BestPipeline) {
    LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
    for (auto &SG : SyncPipeline) {
      LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
                        << " has: \n");
      SUnit *SGBarr = nullptr;
      for (auto &SU : SG.Collection) {
        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
          SGBarr = SU;
        LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
      }
      // Command line requested IGroupLP doesn't have SGBarr
      if (!SGBarr)
        continue;
      // Re-anchor the barrier at the end of its group.
      resetEdges(*SGBarr, DAG);
      SG.link(*SGBarr, false);
    }
  }

  for
(auto &SyncPipeline : BestPipeline) {
    // Link group-to-group in the direction the solver processed them.
    IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
               : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
  }
}

// Link \p SU against every group in [I, E) other than its own (ID \p SGID):
// groups iterated before SU's own group get SU as a successor, groups after
// it get SU as a predecessor. Returns the accumulated missed-edge cost.
template <typename T>
int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
    T I, T E) {
  bool MakePred = false;
  int AddedCost = 0;
  for (; I < E; ++I) {
    if (I->getSGID() == SGID) {
      // From here on, groups come after SU's own group in iteration order.
      MakePred = true;
      continue;
    }
    // NOTE(review): this copies the SchedGroup (including its Collection);
    // confirm whether a reference was intended here.
    auto Group = *I;
    AddedCost += Group.link(*SU, MakePred, AddedEdges);
    assert(AddedCost >= 0);
  }
  return AddedCost;
}

int PipelineSolver::addEdges(
    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {

  // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
  // instructions that are the ultimate successors in the resultant mutation.
  // Therefore, in such a configuration, the SchedGroups occurring before the
  // candidate SGID are successors of the candidate SchedGroup, thus the current
  // SU should be linked as a predecessor to SUs in those SchedGroups. The
  // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
  // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
  // IsBottomUp (in reverse).
  return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
                                SyncPipeline.rend())
                    : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
                                SyncPipeline.end());
}

void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
  // Only remove the edges that we have added when testing
  // the fit.
  for (auto &PredSuccPair : EdgesToRemove) {
    SUnit *Pred = PredSuccPair.first;
    SUnit *Succ = PredSuccPair.second;

    auto Match = llvm::find_if(
        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
    if (Match != Succ->Preds.end()) {
      // Solver-added edges are artificial; anything else here is a bug.
      assert(Match->isArtificial());
      Succ->removePred(*Match);
    }
  }
}

// Move the cursor to the next conflicting instruction, rolling over to the
// next non-trivial pipeline once the current one is exhausted.
void PipelineSolver::advancePosition() {
  ++CurrConflInstNo;

  if (static_cast<size_t>(CurrConflInstNo) >=
      PipelineInstrs[CurrSyncGroupIdx].size()) {
    CurrConflInstNo = 0;
    ++CurrSyncGroupIdx;
    // Advance to next non-trivial pipeline
    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      ++CurrSyncGroupIdx;
  }
}

// Step the solver cursor one conflicted instruction backwards, rolling back
// into the previous non-empty sync-group pipeline when the start of the
// current one is reached. Counterpart of advancePosition, used while
// backtracking in the exact solver.
void PipelineSolver::retreatPosition() {
  assert(CurrConflInstNo >= 0);
  assert(CurrSyncGroupIdx >= 0);

  if (CurrConflInstNo > 0) {
    --CurrConflInstNo;
    return;
  }

  if (CurrConflInstNo == 0) {
    // If we return to the starting position, we have explored
    // the entire tree
    if (CurrSyncGroupIdx == BeginSyncGroupIdx)
      return;

    --CurrSyncGroupIdx;
    // Go to previous non-trivial pipeline
    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      --CurrSyncGroupIdx;

    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
  }
}

// If the cursor has walked past the last pipeline, record the current
// assignment as the best seen so far when it beats the previous best cost.
// Returns true when exploration should stop: a zero-cost (perfect) fit is
// known, or the MaxBranchesExplored budget has been spent.
bool PipelineSolver::checkOptimal() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
    if (BestCost == -1 || CurrCost < BestCost) {
      BestPipeline = CurrPipeline;
      BestCost = CurrCost;
      LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
    }
    assert(BestCost >= 0);
  }

  bool DoneExploring = false;
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
    DoneExploring = true;

  return (DoneExploring || BestCost == 0);
}

// Build the (SchedGroup ID, cost) candidate list for the current conflicted
// instruction from the candidate range [I, E). With the cost heuristic
// enabled, each candidate is scored by tentatively adding its edges and then
// removing them again; full groups get MissPenalty. Without the heuristic,
// every candidate gets the sentinel cost -1.
template <typename T>
void PipelineSolver::populateReadyList(
    SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  assert(CurrSU.second.size() >= 1);

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match);

    if (UseCostHeur) {
      // A full group remains selectable, but only at the miss penalty.
      if (Match->isFull()) {
        ReadyList.push_back(std::pair(*I, MissPenalty));
        continue;
      }

      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);
    } else
      ReadyList.push_back(std::pair(*I, -1));
  }

  if (UseCostHeur) {
615bdd1243dSDimitry Andric std::sort(ReadyList.begin(), ReadyList.end(), 616bdd1243dSDimitry Andric [](std::pair<int, int> A, std::pair<int, int> B) { 617bdd1243dSDimitry Andric return A.second < B.second; 618bdd1243dSDimitry Andric }); 619bdd1243dSDimitry Andric } 620bdd1243dSDimitry Andric 621bdd1243dSDimitry Andric assert(ReadyList.size() == CurrSU.second.size()); 622bdd1243dSDimitry Andric } 623bdd1243dSDimitry Andric 624bdd1243dSDimitry Andric bool PipelineSolver::solveExact() { 625bdd1243dSDimitry Andric if (checkOptimal()) 626bdd1243dSDimitry Andric return true; 627bdd1243dSDimitry Andric 628bdd1243dSDimitry Andric if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) 629bdd1243dSDimitry Andric return false; 630bdd1243dSDimitry Andric 631bdd1243dSDimitry Andric assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); 632bdd1243dSDimitry Andric assert(static_cast<size_t>(CurrConflInstNo) < 633bdd1243dSDimitry Andric PipelineInstrs[CurrSyncGroupIdx].size()); 634bdd1243dSDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 635bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 636bdd1243dSDimitry Andric << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 637bdd1243dSDimitry Andric 638bdd1243dSDimitry Andric // SchedGroup -> Cost pairs 639bdd1243dSDimitry Andric SmallVector<std::pair<int, int>, 4> ReadyList; 640bdd1243dSDimitry Andric // Prioritize the candidate sched groups in terms of lowest cost first 64106c3fb27SDimitry Andric IsBottomUp ? 
populateReadyList(ReadyList, CurrSU.second.rbegin(), 64206c3fb27SDimitry Andric CurrSU.second.rend()) 64306c3fb27SDimitry Andric : populateReadyList(ReadyList, CurrSU.second.begin(), 64406c3fb27SDimitry Andric CurrSU.second.end()); 645bdd1243dSDimitry Andric 646bdd1243dSDimitry Andric auto I = ReadyList.begin(); 647bdd1243dSDimitry Andric auto E = ReadyList.end(); 648bdd1243dSDimitry Andric for (; I != E; ++I) { 649bdd1243dSDimitry Andric // If we are trying SGs in least cost order, and the current SG is cost 650bdd1243dSDimitry Andric // infeasible, then all subsequent SGs will also be cost infeasible, so we 651bdd1243dSDimitry Andric // can prune. 652bdd1243dSDimitry Andric if (BestCost != -1 && (CurrCost + I->second > BestCost)) 653bdd1243dSDimitry Andric return false; 654bdd1243dSDimitry Andric 655bdd1243dSDimitry Andric int CandSGID = I->first; 656bdd1243dSDimitry Andric int AddedCost = 0; 657bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 658bdd1243dSDimitry Andric auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 659bdd1243dSDimitry Andric SchedGroup *Match; 660bdd1243dSDimitry Andric for (auto &SG : SyncPipeline) { 661bdd1243dSDimitry Andric if (SG.getSGID() == CandSGID) 662bdd1243dSDimitry Andric Match = &SG; 663bdd1243dSDimitry Andric } 664bdd1243dSDimitry Andric 665bdd1243dSDimitry Andric if (Match->isFull()) 666bdd1243dSDimitry Andric continue; 667bdd1243dSDimitry Andric 66806c3fb27SDimitry Andric if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) 66906c3fb27SDimitry Andric continue; 67006c3fb27SDimitry Andric 671bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " 672bdd1243dSDimitry Andric << (int)Match->getMask() << "and ID " << CandSGID 673bdd1243dSDimitry Andric << "\n"); 674bdd1243dSDimitry Andric Match->add(*CurrSU.first); 675bdd1243dSDimitry Andric AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 676bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Cost of 
Assignment: " << AddedCost << "\n"); 677bdd1243dSDimitry Andric CurrCost += AddedCost; 678bdd1243dSDimitry Andric advancePosition(); 679bdd1243dSDimitry Andric ++BranchesExplored; 680bdd1243dSDimitry Andric bool FinishedExploring = false; 681bdd1243dSDimitry Andric // If the Cost after adding edges is greater than a known solution, 682bdd1243dSDimitry Andric // backtrack 683bdd1243dSDimitry Andric if (CurrCost < BestCost || BestCost == -1) { 684bdd1243dSDimitry Andric if (solveExact()) { 685bdd1243dSDimitry Andric FinishedExploring = BestCost != 0; 686bdd1243dSDimitry Andric if (!FinishedExploring) 687bdd1243dSDimitry Andric return true; 688bdd1243dSDimitry Andric } 689bdd1243dSDimitry Andric } 690bdd1243dSDimitry Andric 691bdd1243dSDimitry Andric retreatPosition(); 692bdd1243dSDimitry Andric CurrCost -= AddedCost; 693bdd1243dSDimitry Andric removeEdges(AddedEdges); 694bdd1243dSDimitry Andric Match->pop(); 695bdd1243dSDimitry Andric CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 696bdd1243dSDimitry Andric if (FinishedExploring) 697bdd1243dSDimitry Andric return true; 698bdd1243dSDimitry Andric } 699bdd1243dSDimitry Andric 700bdd1243dSDimitry Andric // Try the pipeline where the current instruction is omitted 701bdd1243dSDimitry Andric // Potentially if we omit a problematic instruction from the pipeline, 702bdd1243dSDimitry Andric // all the other instructions can nicely fit. 
703bdd1243dSDimitry Andric CurrCost += MissPenalty; 704bdd1243dSDimitry Andric advancePosition(); 705bdd1243dSDimitry Andric 706bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n"); 707bdd1243dSDimitry Andric 708bdd1243dSDimitry Andric bool FinishedExploring = false; 709bdd1243dSDimitry Andric if (CurrCost < BestCost || BestCost == -1) { 710bdd1243dSDimitry Andric if (solveExact()) { 711bdd1243dSDimitry Andric bool FinishedExploring = BestCost != 0; 712bdd1243dSDimitry Andric if (!FinishedExploring) 713bdd1243dSDimitry Andric return true; 714bdd1243dSDimitry Andric } 715bdd1243dSDimitry Andric } 716bdd1243dSDimitry Andric 717bdd1243dSDimitry Andric retreatPosition(); 718bdd1243dSDimitry Andric CurrCost -= MissPenalty; 719bdd1243dSDimitry Andric return FinishedExploring; 720bdd1243dSDimitry Andric } 721bdd1243dSDimitry Andric 72206c3fb27SDimitry Andric template <typename T> 72306c3fb27SDimitry Andric void PipelineSolver::greedyFind( 72406c3fb27SDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { 725bdd1243dSDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 726bdd1243dSDimitry Andric int BestNodeCost = -1; 727bdd1243dSDimitry Andric int TempCost; 728bdd1243dSDimitry Andric SchedGroup *BestGroup = nullptr; 729bdd1243dSDimitry Andric int BestGroupID = -1; 730bdd1243dSDimitry Andric auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 731bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 732bdd1243dSDimitry Andric << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 733bdd1243dSDimitry Andric 734bdd1243dSDimitry Andric // Since we have added the potential SchedGroups from bottom up, but 735bdd1243dSDimitry Andric // traversed the DAG from top down, parse over the groups from last to 736bdd1243dSDimitry Andric // first. 
  // If we fail to do this for the greedy algorithm, the solution will
  // likely not be good in more complex cases.
  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match);

    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                      << (int)Match->getMask() << "\n");

    if (Match->isFull()) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
      continue;
    }
    if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
      continue;
    }
    // Score this candidate by tentatively adding its edges, then undo.
    TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
    if (TempCost < BestNodeCost || BestNodeCost == -1) {
      BestGroup = Match;
      BestNodeCost = TempCost;
      BestGroupID = CandSGID;
    }
    removeEdges(AddedEdges);
    // A zero-cost fit cannot be improved upon; stop scanning.
    if (BestNodeCost == 0)
      break;
  }

  if (BestGroupID != -1) {
    // Commit the winner: this time the edges are recorded in the caller's
    // AddedEdges (the parameter), not the local shadow used for scoring.
    BestGroup->add(*CurrSU.first);
    addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
                      << (int)BestGroup->getMask() << "\n");
    // NOTE(review): TempCost holds the cost of the *last* candidate tried,
    // which equals BestNodeCost only when the loop exited via the zero-cost
    // break above — confirm whether BestNodeCost was intended here.
    BestCost += TempCost;
  } else
    BestCost += MissPenalty;

  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
}

// Run the greedy heuristic over every conflicted instruction in order,
// accumulating BestCost and publishing the result into BestPipeline.
// Always returns false (no notion of "finished early" in the greedy path).
bool PipelineSolver::solveGreedy() {
  BestCost = 0;
  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
    IsBottomUp
        ?
        greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
        : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
    advancePosition();
  }
  BestPipeline = CurrPipeline;
  // Greedy edges were only for scoring; makePipeline re-adds the final ones.
  removeEdges(AddedEdges);
  return false;
}

// Total number of conflicted instructions across all sync-group pipelines;
// used to size MissPenalty and to gate the exact solver.
unsigned PipelineSolver::computeProblemSize() {
  unsigned ProblemSize = 0;
  for (auto &PipeConflicts : PipelineInstrs) {
    ProblemSize += PipeConflicts.size();
  }

  return ProblemSize;
}

// Entry point: pick a solver (greedy always; exact as a refinement when
// enabled by flag or when the problem is below the cutoff and greedy did not
// reach cost zero), then apply the winning assignment via makePipeline.
void PipelineSolver::solve() {
  if (!NeedsSolver)
    return;

  unsigned ProblemSize = computeProblemSize();
  assert(ProblemSize > 0);

  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
  MissPenalty = (ProblemSize / 2) + 1;

  LLVM_DEBUG(DAG->dump());
  if (EnableExactSolver || BelowCutoff) {
    LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
    // Greedy runs first to seed BestCost as an upper bound for pruning.
    solveGreedy();
    reset();
    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
    if (BestCost > 0) {
      LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
      solveExact();
      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
    }
  } else { // Use the Greedy Algorithm by default
    LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
    solveGreedy();
  }

  makePipeline();
  LLVM_DEBUG(dbgs() << "After applying mutation\n");
  LLVM_DEBUG(DAG->dump());
}

// Identifiers for the strategies selectable via the IGLP_OPT intrinsic.
enum IGLPStrategyID : int {
  MFMASmallGemmOptID = 0,
  MFMASmallGemmSingleWaveOptID = 1,
};

// Implement a IGLP scheduling strategy.
class IGLPStrategy {
protected:
  ScheduleDAGInstrs *DAG;

  const SIInstrInfo *TII;

public:
  /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
  virtual void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      bool IsReentry) = 0;

  // Returns true if this strategy should be applied to a ScheduleDAG.
  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;

  // Direction in which the pipeline is solved and linked; concrete
  // strategies override this in their constructors.
  bool IsBottomUp = 1;

  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : DAG(DAG), TII(TII) {}

  virtual ~IGLPStrategy() = default;
};

// Strategy that interleaves DS instructions (two per group) with single
// MFMAs, repeated proportionally to the MFMA count of the region.
class MFMASmallGemmOpt final : public IGLPStrategy {
private:
public:
  void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      bool IsReentry) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = 1;
  }
};

void MFMASmallGemmOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    bool IsReentry) {
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  // Emit a repeating DS(2) -> MFMA(1) SchedGroup pattern, three repetitions
  // per MFMA counted above.
  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }
}

class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
private:
  // Whether the DS_READ is a predecessor of first four MFMA in region
  class EnablesInitialMFMA final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;
      int MFMAsFound = 0;
      // Lazily cache the region's first four MFMA/WMMA SUnits on first use.
      if (!Cache->size()) {
        for (auto &Elt : SyncPipe[0].DAG->SUnits) {
          if (TII->isMFMAorWMMA(*Elt.getInstr())) {
            ++MFMAsFound;
            if (MFMAsFound > 4)
              break;
            Cache->push_back(&Elt);
          }
        }
      }

      assert(Cache->size());
      auto DAG = SyncPipe[0].DAG;
      // SU qualifies if any of the cached MFMAs is reachable from it.
      for (auto &Elt : *Cache) {
        if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
          return true;
      }
      return false;
    }

    EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
                       bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
  class IsPermForDSW final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto MI = SU->getInstr();
      if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
        return false;

      bool FitsInGroup = false;
      // Does the VALU have a DS_WRITE successor
      if (!Collection.size()) {
        // First member: record its DS-write successors for later members.
        for (auto &Succ : SU->Succs) {
          SUnit *SuccUnit = Succ.getSUnit();
          if (TII->isDS(*SuccUnit->getInstr()) &&
              SuccUnit->getInstr()->mayStore()) {
            Cache->push_back(SuccUnit);
            FitsInGroup = true;
          }
        }
        return FitsInGroup;
      }

      assert(Cache->size());

      // Does the VALU have a DS_WRITE successor that is the same as other
      // VALU already in the group. The V_PERMs will all share 1 DS_W succ
      return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
        return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
          return ThisSucc.getSUnit() == Elt;
        });
      });
    }

    IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether the SU is a successor of any element in previous SchedGroup
  class IsSuccOfPrevGroup final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      // Find the SchedGroup whose ID immediately precedes this rule's SGID.
      SchedGroup *OtherGroup = nullptr;
      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
          OtherGroup = &PipeSG;
        }
      }

      if (!OtherGroup)
        return false;
      // An empty predecessor group imposes no constraint.
      if (!OtherGroup->Collection.size())
        return true;

      // Does the previous VALU have this DS_Write as a successor
      return (std::any_of(OtherGroup->Collection.begin(),
                          OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
                            return std::any_of(Elt->Succs.begin(),
                                               Elt->Succs.end(),
                                               [&SU](SDep &Succ) {
                                                 return Succ.getSUnit() == SU;
                                               });
                          }));
    }
    IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
                      bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether the combined load width of group is 128 bits
  class VMEMSize final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto MI = SU->getInstr();
      if (MI->getOpcode() == TargetOpcode::BUNDLE)
        return false;
      if (!Collection.size())
        return true;

      int NumBits = 0;

      auto TRI = TII->getRegisterInfo();
      auto &MRI = MI->getParent()->getParent()->getRegInfo();
      // Sum the register widths (operand 0's register class) already in the
      // group.
      for (auto &Elt : Collection) {
        auto Op = Elt->getInstr()->getOperand(0);
        auto Size =
            TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
        NumBits += Size;
      }

      // Accept this load only if it still fits in the 128-bit budget.
      if (NumBits < 128) {
        assert(TII->isVMEM(*MI) && MI->mayLoad());
        if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
                          MRI, MI->getOperand(0))) <=
            128)
          return true;
      }

      return false;
    }

    VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
  /// that is \p Distance steps away
  class SharesPredWithPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      if (!Cache->size()) {

        // Locate the group Distance steps back and cache its V_PERM preds.
        for (auto &PipeSG : SyncPipe) {
          if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
            OtherGroup = &PipeSG;
          }
        }

        if (!OtherGroup)
          return false;
        if (!OtherGroup->Collection.size())
          return true;

        for (auto &OtherEle : OtherGroup->Collection) {
          for (auto &Pred : OtherEle->Preds) {
            if (Pred.getSUnit()->getInstr()->getOpcode() ==
                AMDGPU::V_PERM_B32_e64)
              Cache->push_back(Pred.getSUnit());
          }
        }

        // If the other group has no PERM preds, then this group won't share any
        if (!Cache->size())
          return false;
      }

      auto DAG = SyncPipe[0].DAG;
      // Does the previous DS_WRITE share a V_PERM predecessor with this
      // VMEM_READ
      return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
        return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
      });
    }
    SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                               unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };

public:
  void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      bool IsReentry) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

  MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG,
const SIInstrInfo *TII) 110806c3fb27SDimitry Andric : IGLPStrategy(DAG, TII) { 110906c3fb27SDimitry Andric IsBottomUp = 0; 111006c3fb27SDimitry Andric } 111106c3fb27SDimitry Andric }; 111206c3fb27SDimitry Andric 1113*5f757f3fSDimitry Andric static unsigned DSWCount = 0; 1114*5f757f3fSDimitry Andric static unsigned DSWWithPermCount = 0; 1115*5f757f3fSDimitry Andric static unsigned DSWWithSharedVMEMCount = 0; 1116*5f757f3fSDimitry Andric 111706c3fb27SDimitry Andric void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( 111806c3fb27SDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 1119*5f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 1120*5f757f3fSDimitry Andric bool IsReentry) { 112106c3fb27SDimitry Andric unsigned MFMACount = 0; 112206c3fb27SDimitry Andric unsigned DSRCount = 0; 1123*5f757f3fSDimitry Andric 1124*5f757f3fSDimitry Andric assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 && 1125*5f757f3fSDimitry Andric DSWWithSharedVMEMCount == 0)) && 1126*5f757f3fSDimitry Andric "DSWCounters should be zero in pre-RA scheduling!"); 112706c3fb27SDimitry Andric SmallVector<SUnit *, 6> DSWithPerms; 112806c3fb27SDimitry Andric for (auto &SU : DAG->SUnits) { 112906c3fb27SDimitry Andric auto I = SU.getInstr(); 113006c3fb27SDimitry Andric if (TII->isMFMAorWMMA(*I)) 113106c3fb27SDimitry Andric ++MFMACount; 113206c3fb27SDimitry Andric else if (TII->isDS(*I)) { 113306c3fb27SDimitry Andric if (I->mayLoad()) 113406c3fb27SDimitry Andric ++DSRCount; 1135*5f757f3fSDimitry Andric else if (I->mayStore() && !IsReentry) { 113606c3fb27SDimitry Andric ++DSWCount; 113706c3fb27SDimitry Andric for (auto Pred : SU.Preds) { 113806c3fb27SDimitry Andric if (Pred.getSUnit()->getInstr()->getOpcode() == 113906c3fb27SDimitry Andric AMDGPU::V_PERM_B32_e64) { 114006c3fb27SDimitry Andric DSWithPerms.push_back(&SU); 114106c3fb27SDimitry Andric break; 114206c3fb27SDimitry Andric } 114306c3fb27SDimitry Andric } 114406c3fb27SDimitry Andric } 
114506c3fb27SDimitry Andric } 114606c3fb27SDimitry Andric } 1147*5f757f3fSDimitry Andric 1148*5f757f3fSDimitry Andric if (!IsReentry) { 114906c3fb27SDimitry Andric DSWWithPermCount = DSWithPerms.size(); 115006c3fb27SDimitry Andric auto I = DSWithPerms.begin(); 115106c3fb27SDimitry Andric auto E = DSWithPerms.end(); 115206c3fb27SDimitry Andric 115306c3fb27SDimitry Andric // Get the count of DS_WRITES with V_PERM predecessors which 115406c3fb27SDimitry Andric // have loop carried dependencies (WAR) on the same VMEM_READs. 115506c3fb27SDimitry Andric // We consider partial overlap as a miss -- in other words, 115606c3fb27SDimitry Andric // for a given DS_W, we only consider another DS_W as matching 115706c3fb27SDimitry Andric // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred 115806c3fb27SDimitry Andric // for every V_PERM pred of this DS_W. 115906c3fb27SDimitry Andric DenseMap<MachineInstr *, SUnit *> VMEMLookup; 116006c3fb27SDimitry Andric SmallVector<SUnit *, 6> Counted; 116106c3fb27SDimitry Andric for (; I != E; I++) { 116206c3fb27SDimitry Andric SUnit *Cand = nullptr; 116306c3fb27SDimitry Andric bool MissedAny = false; 116406c3fb27SDimitry Andric for (auto &Pred : (*I)->Preds) { 116506c3fb27SDimitry Andric if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) 116606c3fb27SDimitry Andric continue; 116706c3fb27SDimitry Andric 1168*5f757f3fSDimitry Andric if (Cand && llvm::is_contained(Counted, Cand)) 116906c3fb27SDimitry Andric break; 117006c3fb27SDimitry Andric 117106c3fb27SDimitry Andric for (auto &Succ : Pred.getSUnit()->Succs) { 117206c3fb27SDimitry Andric auto MI = Succ.getSUnit()->getInstr(); 117306c3fb27SDimitry Andric if (!TII->isVMEM(*MI) || !MI->mayLoad()) 117406c3fb27SDimitry Andric continue; 117506c3fb27SDimitry Andric 117606c3fb27SDimitry Andric if (MissedAny || !VMEMLookup.size()) { 117706c3fb27SDimitry Andric MissedAny = true; 117806c3fb27SDimitry Andric VMEMLookup[MI] = *I; 117906c3fb27SDimitry Andric 
continue; 118006c3fb27SDimitry Andric } 118106c3fb27SDimitry Andric 118206c3fb27SDimitry Andric if (!VMEMLookup.contains(MI)) { 118306c3fb27SDimitry Andric MissedAny = true; 118406c3fb27SDimitry Andric VMEMLookup[MI] = *I; 118506c3fb27SDimitry Andric continue; 118606c3fb27SDimitry Andric } 118706c3fb27SDimitry Andric 118806c3fb27SDimitry Andric Cand = VMEMLookup[MI]; 1189*5f757f3fSDimitry Andric if (llvm::is_contained(Counted, Cand)) { 119006c3fb27SDimitry Andric MissedAny = true; 119106c3fb27SDimitry Andric break; 119206c3fb27SDimitry Andric } 119306c3fb27SDimitry Andric } 119406c3fb27SDimitry Andric } 119506c3fb27SDimitry Andric if (!MissedAny && Cand) { 119606c3fb27SDimitry Andric DSWWithSharedVMEMCount += 2; 119706c3fb27SDimitry Andric Counted.push_back(Cand); 119806c3fb27SDimitry Andric Counted.push_back(*I); 119906c3fb27SDimitry Andric } 120006c3fb27SDimitry Andric } 1201*5f757f3fSDimitry Andric } 120206c3fb27SDimitry Andric 120306c3fb27SDimitry Andric assert(DSWWithSharedVMEMCount <= DSWWithPermCount); 120406c3fb27SDimitry Andric SchedGroup *SG; 120506c3fb27SDimitry Andric unsigned PipelineSyncID = 0; 120606c3fb27SDimitry Andric // For kernels with V_PERM, there are enough VALU to mix in between MFMAs 120706c3fb27SDimitry Andric if (DSWWithPermCount) { 120806c3fb27SDimitry Andric for (unsigned I = 0; I < MFMACount; I++) { 120906c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 121006c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 121106c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 121206c3fb27SDimitry Andric 121306c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 121406c3fb27SDimitry Andric SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); 121506c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 121606c3fb27SDimitry Andric } 121706c3fb27SDimitry Andric } 121806c3fb27SDimitry Andric 121906c3fb27SDimitry Andric PipelineSyncID = 1; 
122006c3fb27SDimitry Andric // Phase 1: Break up DS_READ and MFMA clusters. 122106c3fb27SDimitry Andric // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ 122206c3fb27SDimitry Andric // prefetch 122306c3fb27SDimitry Andric 122406c3fb27SDimitry Andric // Make ready initial MFMA 122506c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 122606c3fb27SDimitry Andric SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); 122706c3fb27SDimitry Andric SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); 122806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 122906c3fb27SDimitry Andric 123006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 123106c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 123206c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 123306c3fb27SDimitry Andric 123406c3fb27SDimitry Andric // Interleave MFMA with DS_READ prefetch 123506c3fb27SDimitry Andric for (unsigned I = 0; I < DSRCount - 4; ++I) { 123606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 123706c3fb27SDimitry Andric SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); 123806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 123906c3fb27SDimitry Andric 124006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 124106c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 124206c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 124306c3fb27SDimitry Andric } 124406c3fb27SDimitry Andric 124506c3fb27SDimitry Andric // Phase 2a: Loop carried dependency with V_PERM 124606c3fb27SDimitry Andric // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 124706c3fb27SDimitry Andric // depend on. Interleave MFMA to keep XDL unit busy throughout. 
124806c3fb27SDimitry Andric for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { 124906c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 125006c3fb27SDimitry Andric SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 125106c3fb27SDimitry Andric SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 125206c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 125306c3fb27SDimitry Andric 125406c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 125506c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 125606c3fb27SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 125706c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 125806c3fb27SDimitry Andric 125906c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 126006c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 126106c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 126206c3fb27SDimitry Andric 1, TII, SG->getSGID(), true)); 126306c3fb27SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 126406c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 126506c3fb27SDimitry Andric 126606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 126706c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 126806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 126906c3fb27SDimitry Andric 127006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 127106c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 127206c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 127306c3fb27SDimitry Andric 3, TII, SG->getSGID(), true)); 127406c3fb27SDimitry Andric 
SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 127506c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 127606c3fb27SDimitry Andric 127706c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 127806c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 127906c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 128006c3fb27SDimitry Andric } 128106c3fb27SDimitry Andric 128206c3fb27SDimitry Andric // Phase 2b: Loop carried dependency without V_PERM 128306c3fb27SDimitry Andric // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. 128406c3fb27SDimitry Andric // Interleave MFMA to keep XDL unit busy throughout. 128506c3fb27SDimitry Andric for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { 128606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 128706c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 128806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 128906c3fb27SDimitry Andric 129006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 129106c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 129206c3fb27SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 129306c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 129406c3fb27SDimitry Andric 129506c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 129606c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 129706c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 129806c3fb27SDimitry Andric } 129906c3fb27SDimitry Andric 130006c3fb27SDimitry Andric // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are 130106c3fb27SDimitry Andric // ultimately used by two DS_WRITE 130206c3fb27SDimitry Andric // Schedule VPerm & DS_WRITE as closely as 
possible to the VMEM_READ they 130306c3fb27SDimitry Andric // depend on. Interleave MFMA to keep XDL unit busy throughout. 130406c3fb27SDimitry Andric 130506c3fb27SDimitry Andric for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { 130606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 130706c3fb27SDimitry Andric SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 130806c3fb27SDimitry Andric SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 130906c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 131006c3fb27SDimitry Andric 131106c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 131206c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 131306c3fb27SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 131406c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 131506c3fb27SDimitry Andric 131606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 131706c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 131806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 131906c3fb27SDimitry Andric 132006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 132106c3fb27SDimitry Andric SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 132206c3fb27SDimitry Andric SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 132306c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 132406c3fb27SDimitry Andric 132506c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 132606c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 132706c3fb27SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 132806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 132906c3fb27SDimitry Andric 
133006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 133106c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 133206c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 133306c3fb27SDimitry Andric 133406c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 133506c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 133606c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 133706c3fb27SDimitry Andric 2, TII, SG->getSGID(), true)); 133806c3fb27SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 133906c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 134006c3fb27SDimitry Andric 134106c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 134206c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 134306c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 134406c3fb27SDimitry Andric 134506c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 134606c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 134706c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 134806c3fb27SDimitry Andric 4, TII, SG->getSGID(), true)); 134906c3fb27SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 135006c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 135106c3fb27SDimitry Andric 135206c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 135306c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 135406c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 135506c3fb27SDimitry Andric } 135606c3fb27SDimitry Andric } 135706c3fb27SDimitry Andric 1358bdd1243dSDimitry Andric static std::unique_ptr<IGLPStrategy> 1359bdd1243dSDimitry Andric 
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                   const SIInstrInfo *TII) {
  // Factory for the strategy selected by the IGLP_OPT intrinsic operand.
  switch (ID) {
  case MFMASmallGemmOptID:
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
  case MFMASmallGemmSingleWaveOptID:
    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
  }

  llvm_unreachable("Unknown IGLPStrategyID");
}

// DAG mutation that implements SCHED_BARRIER, SCHED_GROUP_BARRIER and
// IGLP_OPT by building SchedGroups and handing them to the PipelineSolver,
// which adds the artificial edges that enforce the requested ordering.
class IGroupLPDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Organize lists of SchedGroups by their SyncID. SchedGroups /
  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
  // between them.
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;

  // Used to track instructions that can be mapped to multiple sched groups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
  // not be reordered across the SCHED_BARRIER. This is used for the base
  // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
  // SCHED_BARRIER will always block all instructions that can be classified
  // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
  // and may only synchronize with some SchedGroups. Returns the inverse of
  // Mask. SCHED_BARRIER's mask describes which instruction types should be
  // allowed to be scheduled across it. Invert the mask to get the
  // SchedGroupMask of instructions that should be barred.
  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  // Create SchedGroups for a SCHED_GROUP_BARRIER.
  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  // Select and apply the IGLP strategy named by the IGLP_OPT instruction \p SU.
  void initIGLPOpt(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  // The order in which the PipelineSolver should process the candidate
  // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
  // created SchedGroup first, and will consider that as the ultimate
  // predecessor group when linking. TOP_DOWN instead links and processes the
  // first created SchedGroup first.
  bool IsBottomUp = 1;

  // Whether or not this is a reentry into the IGroupLPDAGMutation.
  bool IsReentry = false;

  IGroupLPDAGMutation() = default;
  // NOTE(review): single-argument constructor allows implicit bool ->
  // IGroupLPDAGMutation conversion; consider marking `explicit`.
  IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {}
};

unsigned SchedGroup::NumSchedGroups = 0;

// Add an artificial edge making A a predecessor of B, unless A == B or the
// DAG reports the edge would create a cycle. Returns true iff the edge was
// added.
bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
  if (A != B && DAG->canAddEdge(B, A)) {
    DAG->addEdge(B, SDep(A, SDep::Artificial));
    return true;
  }
  return false;
}

// Whether \p MI can be classified into this group according to SGMask.
// Meta instructions never classify; each mask bit below matches a class of
// instructions, with read/write refinements for VMEM and DS.
bool SchedGroup::canAddMI(const MachineInstr &MI) const {
  bool Result = false;
  if (MI.isMetaInstruction())
    Result = false;

  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
           (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI)))
    Result = true;

  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
           TII->isSALU(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMAorWMMA(MI))
    Result = true;

  // FLAT-but-not-DS counts as VMEM for classification purposes.
  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
           TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))
    Result = true;

  LLVM_DEBUG(
      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
             << (Result ? " could classify " : " unable to classify ") << MI);

  return Result;
}

// Link every member of this group to \p SU (as successor, or as predecessor
// when \p MakePred), skipping edges that already exist via reachability.
// Returns the number of edges that could not be added.
int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int MissedEdges = 0;
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    if (DAG->IsReachable(B, A))
      continue;

    // tryAddEdge returns false if there is a dependency that makes adding
    // the A->B edge impossible, otherwise it returns true;
    bool Added = tryAddEdge(A, B);
    if (Added)
      AddedEdges.push_back(std::pair(A, B));
    else
      ++MissedEdges;
  }

  return MissedEdges;
}

// Best-effort variant of the above: link every member to \p SU without
// reachability pre-checks and without reporting missed edges.
void SchedGroup::link(SUnit &SU, bool MakePred) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}
1522bdd1243dSDimitry Andric 1523bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU, 1524bdd1243dSDimitry Andric function_ref<bool(const SUnit *A, const SUnit *B)> P) { 1525bdd1243dSDimitry Andric for (auto *A : Collection) { 1526bdd1243dSDimitry Andric SUnit *B = &SU; 1527bdd1243dSDimitry Andric if (P(A, B)) 1528bdd1243dSDimitry Andric std::swap(A, B); 1529bdd1243dSDimitry Andric 1530bdd1243dSDimitry Andric tryAddEdge(A, B); 1531bdd1243dSDimitry Andric } 1532bdd1243dSDimitry Andric } 1533bdd1243dSDimitry Andric 1534bdd1243dSDimitry Andric void SchedGroup::link(SchedGroup &OtherGroup) { 1535bdd1243dSDimitry Andric for (auto *B : OtherGroup.Collection) 1536bdd1243dSDimitry Andric link(*B); 1537bdd1243dSDimitry Andric } 1538bdd1243dSDimitry Andric 1539bdd1243dSDimitry Andric bool SchedGroup::canAddSU(SUnit &SU) const { 1540bdd1243dSDimitry Andric MachineInstr &MI = *SU.getInstr(); 1541bdd1243dSDimitry Andric if (MI.getOpcode() != TargetOpcode::BUNDLE) 1542bdd1243dSDimitry Andric return canAddMI(MI); 1543bdd1243dSDimitry Andric 1544bdd1243dSDimitry Andric // Special case for bundled MIs. 1545bdd1243dSDimitry Andric const MachineBasicBlock *MBB = MI.getParent(); 1546bdd1243dSDimitry Andric MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; 1547bdd1243dSDimitry Andric while (E != MBB->end() && E->isBundledWithPred()) 1548bdd1243dSDimitry Andric ++E; 1549bdd1243dSDimitry Andric 1550bdd1243dSDimitry Andric // Return true if all of the bundled MIs can be added to this group. 
1551bdd1243dSDimitry Andric return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); 1552bdd1243dSDimitry Andric } 1553bdd1243dSDimitry Andric 1554bdd1243dSDimitry Andric void SchedGroup::initSchedGroup() { 1555bdd1243dSDimitry Andric for (auto &SU : DAG->SUnits) { 1556bdd1243dSDimitry Andric if (isFull()) 1557bdd1243dSDimitry Andric break; 1558bdd1243dSDimitry Andric 1559bdd1243dSDimitry Andric if (canAddSU(SU)) 1560bdd1243dSDimitry Andric add(SU); 1561bdd1243dSDimitry Andric } 1562bdd1243dSDimitry Andric } 1563bdd1243dSDimitry Andric 1564bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 1565bdd1243dSDimitry Andric SUnitsToCandidateSGsMap &SyncedInstrs) { 1566bdd1243dSDimitry Andric SUnit &InitSU = *RIter; 1567bdd1243dSDimitry Andric for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { 1568bdd1243dSDimitry Andric auto &SU = *RIter; 1569bdd1243dSDimitry Andric if (isFull()) 1570bdd1243dSDimitry Andric break; 1571bdd1243dSDimitry Andric 1572bdd1243dSDimitry Andric if (canAddSU(SU)) 1573bdd1243dSDimitry Andric SyncedInstrs[&SU].push_back(SGID); 1574bdd1243dSDimitry Andric } 1575bdd1243dSDimitry Andric 1576bdd1243dSDimitry Andric add(InitSU); 1577bdd1243dSDimitry Andric assert(MaxSize); 1578bdd1243dSDimitry Andric (*MaxSize)++; 1579bdd1243dSDimitry Andric } 1580bdd1243dSDimitry Andric 1581bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { 1582bdd1243dSDimitry Andric auto I = DAG->SUnits.rbegin(); 1583bdd1243dSDimitry Andric auto E = DAG->SUnits.rend(); 1584bdd1243dSDimitry Andric for (; I != E; ++I) { 1585bdd1243dSDimitry Andric auto &SU = *I; 1586bdd1243dSDimitry Andric if (isFull()) 1587bdd1243dSDimitry Andric break; 1588bdd1243dSDimitry Andric 1589bdd1243dSDimitry Andric if (canAddSU(SU)) 1590bdd1243dSDimitry Andric SyncedInstrs[&SU].push_back(SGID); 1591bdd1243dSDimitry Andric } 1592bdd1243dSDimitry Andric } 1593bdd1243dSDimitry Andric 
// Entry point of the mutation: scan the region (bottom-up) for the special
// scheduling instructions and, if any were found, run the PipelineSolver to
// add the enforcing DAG edges.
void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  // State may persist from a previous region; start fresh.
  SyncedSchedGroups.clear();
  SyncedInstrs.clear();
  bool foundSB = false;
  bool foundIGLP = false;
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
    unsigned Opc = R->getInstr()->getOpcode();
    // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
    if (Opc == AMDGPU::SCHED_BARRIER) {
      addSchedBarrierEdges(*R);
      foundSB = true;
    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
      initSchedGroupBarrierPipelineStage(R);
      foundSB = true;
    } else if (Opc == AMDGPU::IGLP_OPT) {
      resetEdges(*R, DAG);
      // Only the first IGLP_OPT is honored, and only when no SCHED_BARRIER
      // was seen earlier in the scan.
      if (!foundSB && !foundIGLP)
        initIGLPOpt(*R);
      foundIGLP = true;
    }
  }

  if (foundSB || foundIGLP) {
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
    // PipelineSolver performs the mutation by adding the edges it
    // determined as the best
    PS.solve();
  }
}

// Enforce a SCHED_BARRIER: bar every instruction matching the inverse of the
// barrier's mask from being reordered across it.
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to the
  // instruction having side effects.
  resetEdges(SchedBarrier, DAG);
  auto InvertedMask =
      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
  // An unbounded group (no MaxSize) collecting every barred instruction.
  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
  SG.initSchedGroup();
  // Preserve original instruction ordering relative to the SCHED_BARRIER.
  SG.link(
      SchedBarrier,
      (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
          const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
}

SchedGroupMask
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
  // Invert mask and erase bits for types of instructions that are implied to be
  // allowed past the SCHED_BARRIER.
  SchedGroupMask InvertedMask = ~Mask;

  // ALU implies VALU, SALU, MFMA.
  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
    InvertedMask &=
        ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
  // VALU, SALU, MFMA implies ALU.
  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::ALU;

  // VMEM implies VMEM_READ, VMEM_WRITE.
  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
  // VMEM_READ, VMEM_WRITE implies VMEM.
  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM;

  // DS implies DS_READ, DS_WRITE.
  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
  // DS_READ, DS_WRITE implies DS.
  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS;

  return InvertedMask;
}

// Turn one SCHED_GROUP_BARRIER (mask, size, sync-id operands) into a
// SchedGroup seeded from the instructions above it in the region.
void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
    std::vector<SUnit>::reverse_iterator RIter) {
  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
  // to the instruction having side effects.
  resetEdges(*RIter, DAG);
  MachineInstr &SGB = *RIter->getInstr();
  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
  int32_t SGMask = SGB.getOperand(0).getImm();
  int32_t Size = SGB.getOperand(1).getImm();
  int32_t SyncID = SGB.getOperand(2).getImm();

  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
                                                    Size, SyncID, DAG, TII);

  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
}

// Instantiate the strategy named by the IGLP_OPT instruction's operand and,
// if it elects to run on this DAG, let it populate the synced groups.
void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
  IGLPStrategyID StrategyID =
      (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
  auto S = createIGLPStrategy(StrategyID, DAG, TII);
  if (S->shouldApplyStrategy(DAG)) {
    // The solver direction is dictated by the chosen strategy.
    IsBottomUp = S->IsBottomUp;
    S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry);
  }
}

} // namespace

namespace llvm {

/// \p IsReentry specifies whether or not this is a reentry into the
/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
/// scheduling "phases"), we can reenter this mutation framework more than once
/// for a given region.
1720*5f757f3fSDimitry Andric std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) { 1721*5f757f3fSDimitry Andric return std::make_unique<IGroupLPDAGMutation>(IsReentry); 172281ad6265SDimitry Andric } 172381ad6265SDimitry Andric 172481ad6265SDimitry Andric } // end namespace llvm 1725