181ad6265SDimitry Andric //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// 281ad6265SDimitry Andric // 381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 681ad6265SDimitry Andric // 781ad6265SDimitry Andric //===----------------------------------------------------------------------===// 881ad6265SDimitry Andric // 981ad6265SDimitry Andric // \file This file defines a set of schedule DAG mutations that can be used to 1081ad6265SDimitry Andric // override default scheduler behavior to enforce specific scheduling patterns. 1181ad6265SDimitry Andric // They should be used in cases where runtime performance considerations such as 1281ad6265SDimitry Andric // inter-wavefront interactions, mean that compile-time heuristics cannot 1381ad6265SDimitry Andric // predict the optimal instruction ordering, or in kernels where optimum 1481ad6265SDimitry Andric // instruction scheduling is important enough to warrant manual intervention. 1581ad6265SDimitry Andric // 1681ad6265SDimitry Andric //===----------------------------------------------------------------------===// 1781ad6265SDimitry Andric 1881ad6265SDimitry Andric #include "AMDGPUIGroupLP.h" 1981ad6265SDimitry Andric #include "AMDGPUTargetMachine.h" 2081ad6265SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 2181ad6265SDimitry Andric #include "SIInstrInfo.h" 2281ad6265SDimitry Andric #include "SIMachineFunctionInfo.h" 2381ad6265SDimitry Andric #include "llvm/ADT/BitmaskEnum.h" 24bdd1243dSDimitry Andric #include "llvm/ADT/DenseMap.h" 2581ad6265SDimitry Andric #include "llvm/CodeGen/MachineScheduler.h" 2681ad6265SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h" 2781ad6265SDimitry Andric 2881ad6265SDimitry Andric using namespace llvm; 2981ad6265SDimitry Andric 30bdd1243dSDimitry Andric #define DEBUG_TYPE "igrouplp" 3181ad6265SDimitry Andric 3281ad6265SDimitry Andric namespace { 3381ad6265SDimitry Andric 34bdd1243dSDimitry Andric static cl::opt<bool> EnableExactSolver( 35bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver", cl::Hidden, 36bdd1243dSDimitry Andric cl::desc("Whether to use the exponential time solver to fit " 37bdd1243dSDimitry Andric "the instructions to the pipeline as closely as " 38bdd1243dSDimitry Andric "possible."), 3981ad6265SDimitry Andric cl::init(false)); 4081ad6265SDimitry Andric 41bdd1243dSDimitry Andric static cl::opt<unsigned> CutoffForExact( 42bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden, 43bdd1243dSDimitry Andric cl::desc("The maximum number of scheduling group conflicts " 44bdd1243dSDimitry Andric "which we attempt to solve with the exponential time " 45bdd1243dSDimitry Andric "exact solver. Problem sizes greater than this will" 46bdd1243dSDimitry Andric "be solved by the less accurate greedy algorithm. Selecting " 47bdd1243dSDimitry Andric "solver by size is superseded by manually selecting " 48bdd1243dSDimitry Andric "the solver (e.g. by amdgpu-igrouplp-exact-solver")); 4981ad6265SDimitry Andric 50bdd1243dSDimitry Andric static cl::opt<uint64_t> MaxBranchesExplored( 51bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden, 52bdd1243dSDimitry Andric cl::desc("The amount of branches that we are willing to explore with" 53bdd1243dSDimitry Andric "the exact algorithm before giving up.")); 5481ad6265SDimitry Andric 55bdd1243dSDimitry Andric static cl::opt<bool> UseCostHeur( 56bdd1243dSDimitry Andric "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden, 57bdd1243dSDimitry Andric cl::desc("Whether to use the cost heuristic to make choices as we " 58bdd1243dSDimitry Andric "traverse the search space using the exact solver. Defaulted " 59bdd1243dSDimitry Andric "to on, and if turned off, we will use the node order -- " 60bdd1243dSDimitry Andric "attempting to put the later nodes in the later sched groups. " 61bdd1243dSDimitry Andric "Experimentally, results are mixed, so this should be set on a " 62bdd1243dSDimitry Andric "case-by-case basis.")); 6381ad6265SDimitry Andric 64bdd1243dSDimitry Andric // Components of the mask that determines which instruction types may be may be 65bdd1243dSDimitry Andric // classified into a SchedGroup. 66bdd1243dSDimitry Andric enum class SchedGroupMask { 6781ad6265SDimitry Andric NONE = 0u, 6881ad6265SDimitry Andric ALU = 1u << 0, 6981ad6265SDimitry Andric VALU = 1u << 1, 7081ad6265SDimitry Andric SALU = 1u << 2, 7181ad6265SDimitry Andric MFMA = 1u << 3, 7281ad6265SDimitry Andric VMEM = 1u << 4, 7381ad6265SDimitry Andric VMEM_READ = 1u << 5, 7481ad6265SDimitry Andric VMEM_WRITE = 1u << 6, 7581ad6265SDimitry Andric DS = 1u << 7, 7681ad6265SDimitry Andric DS_READ = 1u << 8, 7781ad6265SDimitry Andric DS_WRITE = 1u << 9, 78cb14a3feSDimitry Andric TRANS = 1u << 10, 79bdd1243dSDimitry Andric ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS | 80cb14a3feSDimitry Andric DS_READ | DS_WRITE | TRANS, 81bdd1243dSDimitry Andric LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) 8281ad6265SDimitry Andric }; 8381ad6265SDimitry Andric 8406c3fb27SDimitry Andric class SchedGroup; 8506c3fb27SDimitry Andric 8606c3fb27SDimitry Andric // InstructionRule class is used to enact a filter which determines whether or 8706c3fb27SDimitry Andric // not an SU maps to a given SchedGroup. It contains complementary data 8806c3fb27SDimitry Andric // structures (e.g Cache) to help those filters. 8906c3fb27SDimitry Andric class InstructionRule { 9006c3fb27SDimitry Andric protected: 9106c3fb27SDimitry Andric const SIInstrInfo *TII; 9206c3fb27SDimitry Andric unsigned SGID; 9306c3fb27SDimitry Andric // A cache made available to the Filter to store SUnits for subsequent 9406c3fb27SDimitry Andric // invocations of the Filter 9506c3fb27SDimitry Andric std::optional<SmallVector<SUnit *, 4>> Cache; 9606c3fb27SDimitry Andric 9706c3fb27SDimitry Andric public: 9806c3fb27SDimitry Andric virtual bool 9906c3fb27SDimitry Andric apply(const SUnit *, const ArrayRef<SUnit *>, 10006c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &) { 10106c3fb27SDimitry Andric return true; 10206c3fb27SDimitry Andric }; 10306c3fb27SDimitry Andric 10406c3fb27SDimitry Andric InstructionRule(const SIInstrInfo *TII, unsigned SGID, 10506c3fb27SDimitry Andric bool NeedsCache = false) 10606c3fb27SDimitry Andric : TII(TII), SGID(SGID) { 10706c3fb27SDimitry Andric if (NeedsCache) { 10806c3fb27SDimitry Andric Cache = SmallVector<SUnit *, 4>(); 10906c3fb27SDimitry Andric } 11006c3fb27SDimitry Andric } 11106c3fb27SDimitry Andric 11206c3fb27SDimitry Andric virtual ~InstructionRule() = default; 11306c3fb27SDimitry Andric }; 11406c3fb27SDimitry Andric 115*0fca6ea1SDimitry Andric using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>; 11681ad6265SDimitry Andric 117bdd1243dSDimitry Andric // Classify instructions into groups to enable fine tuned control over the 118bdd1243dSDimitry Andric // scheduler. These groups may be more specific than current SchedModel 119bdd1243dSDimitry Andric // instruction classes. 120bdd1243dSDimitry Andric class SchedGroup { 121bdd1243dSDimitry Andric private: 122bdd1243dSDimitry Andric // Mask that defines which instruction types can be classified into this 123bdd1243dSDimitry Andric // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER 124bdd1243dSDimitry Andric // and SCHED_GROUP_BARRIER. 125bdd1243dSDimitry Andric SchedGroupMask SGMask; 12681ad6265SDimitry Andric 127bdd1243dSDimitry Andric // Maximum number of SUnits that can be added to this group. 128bdd1243dSDimitry Andric std::optional<unsigned> MaxSize; 12981ad6265SDimitry Andric 130bdd1243dSDimitry Andric // SchedGroups will only synchronize with other SchedGroups that have the same 131bdd1243dSDimitry Andric // SyncID. 132bdd1243dSDimitry Andric int SyncID = 0; 13381ad6265SDimitry Andric 134bdd1243dSDimitry Andric // SGID is used to map instructions to candidate SchedGroups 135bdd1243dSDimitry Andric unsigned SGID; 136bdd1243dSDimitry Andric 13706c3fb27SDimitry Andric // The different rules each instruction in this SchedGroup must conform to 13806c3fb27SDimitry Andric SmallVector<std::shared_ptr<InstructionRule>, 4> Rules; 13906c3fb27SDimitry Andric 140bdd1243dSDimitry Andric // Count of the number of created SchedGroups, used to initialize SGID. 141bdd1243dSDimitry Andric static unsigned NumSchedGroups; 142bdd1243dSDimitry Andric 143bdd1243dSDimitry Andric // Try to add and edge from SU A to SU B. 144bdd1243dSDimitry Andric bool tryAddEdge(SUnit *A, SUnit *B); 145bdd1243dSDimitry Andric 146bdd1243dSDimitry Andric // Use SGMask to determine whether we can classify MI as a member of this 147bdd1243dSDimitry Andric // SchedGroup object. 148bdd1243dSDimitry Andric bool canAddMI(const MachineInstr &MI) const; 14981ad6265SDimitry Andric 15081ad6265SDimitry Andric public: 151bdd1243dSDimitry Andric // Collection of SUnits that are classified as members of this group. 152bdd1243dSDimitry Andric SmallVector<SUnit *, 32> Collection; 15381ad6265SDimitry Andric 15406c3fb27SDimitry Andric ScheduleDAGInstrs *DAG; 155*0fca6ea1SDimitry Andric const SIInstrInfo *TII; 15606c3fb27SDimitry Andric 157bdd1243dSDimitry Andric // Returns true if SU can be added to this SchedGroup. 158bdd1243dSDimitry Andric bool canAddSU(SUnit &SU) const; 15981ad6265SDimitry Andric 160bdd1243dSDimitry Andric // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If 161bdd1243dSDimitry Andric // MakePred is true, SU will be a predecessor of the SUnits in this 162bdd1243dSDimitry Andric // SchedGroup, otherwise SU will be a successor. 163bdd1243dSDimitry Andric void link(SUnit &SU, bool MakePred = false); 16481ad6265SDimitry Andric 165bdd1243dSDimitry Andric // Add DAG dependencies and track which edges are added, and the count of 166bdd1243dSDimitry Andric // missed edges 167bdd1243dSDimitry Andric int link(SUnit &SU, bool MakePred, 168bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 16981ad6265SDimitry Andric 170bdd1243dSDimitry Andric // Add DAG dependencies from all SUnits in this SchedGroup and this SU. 171bdd1243dSDimitry Andric // Use the predicate to determine whether SU should be a predecessor (P = 172bdd1243dSDimitry Andric // true) or a successor (P = false) of this SchedGroup. 173bdd1243dSDimitry Andric void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P); 17481ad6265SDimitry Andric 175bdd1243dSDimitry Andric // Add DAG dependencies such that SUnits in this group shall be ordered 176bdd1243dSDimitry Andric // before SUnits in OtherGroup. 177bdd1243dSDimitry Andric void link(SchedGroup &OtherGroup); 178bdd1243dSDimitry Andric 179bdd1243dSDimitry Andric // Returns true if no more instructions may be added to this group. 180bdd1243dSDimitry Andric bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; } 181bdd1243dSDimitry Andric 18206c3fb27SDimitry Andric // Append a constraint that SUs must meet in order to fit into this 18306c3fb27SDimitry Andric // SchedGroup. Since many rules involve the relationship between a SchedGroup 18406c3fb27SDimitry Andric // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve 18506c3fb27SDimitry Andric // time (rather than SchedGroup init time.) 18606c3fb27SDimitry Andric void addRule(std::shared_ptr<InstructionRule> NewRule) { 18706c3fb27SDimitry Andric Rules.push_back(NewRule); 18806c3fb27SDimitry Andric } 18906c3fb27SDimitry Andric 19006c3fb27SDimitry Andric // Returns true if the SU matches all rules 19106c3fb27SDimitry Andric bool allowedByRules(const SUnit *SU, 19206c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) const { 193*0fca6ea1SDimitry Andric for (auto &Rule : Rules) { 194*0fca6ea1SDimitry Andric if (!Rule.get()->apply(SU, Collection, SyncPipe)) 19506c3fb27SDimitry Andric return false; 19606c3fb27SDimitry Andric } 19706c3fb27SDimitry Andric return true; 19806c3fb27SDimitry Andric } 19906c3fb27SDimitry Andric 200bdd1243dSDimitry Andric // Add SU to the SchedGroup. 201bdd1243dSDimitry Andric void add(SUnit &SU) { 202bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "For SchedGroup with mask " 203bdd1243dSDimitry Andric << format_hex((int)SGMask, 10, true) << " adding " 204bdd1243dSDimitry Andric << *SU.getInstr()); 205bdd1243dSDimitry Andric Collection.push_back(&SU); 20681ad6265SDimitry Andric } 20781ad6265SDimitry Andric 208bdd1243dSDimitry Andric // Remove last element in the SchedGroup 209bdd1243dSDimitry Andric void pop() { Collection.pop_back(); } 210bdd1243dSDimitry Andric 211bdd1243dSDimitry Andric // Identify and add all relevant SUs from the DAG to this SchedGroup. 212bdd1243dSDimitry Andric void initSchedGroup(); 213bdd1243dSDimitry Andric 214bdd1243dSDimitry Andric // Add instructions to the SchedGroup bottom up starting from RIter. 215bdd1243dSDimitry Andric // PipelineInstrs is a set of instructions that should not be added to the 216bdd1243dSDimitry Andric // SchedGroup even when the other conditions for adding it are satisfied. 217bdd1243dSDimitry Andric // RIter will be added to the SchedGroup as well, and dependencies will be 218bdd1243dSDimitry Andric // added so that RIter will always be scheduled at the end of the group. 219bdd1243dSDimitry Andric void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 220bdd1243dSDimitry Andric SUnitsToCandidateSGsMap &SyncedInstrs); 221bdd1243dSDimitry Andric 222bdd1243dSDimitry Andric void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs); 223bdd1243dSDimitry Andric 224bdd1243dSDimitry Andric int getSyncID() { return SyncID; } 225bdd1243dSDimitry Andric 226bdd1243dSDimitry Andric int getSGID() { return SGID; } 227bdd1243dSDimitry Andric 228bdd1243dSDimitry Andric SchedGroupMask getMask() { return SGMask; } 229bdd1243dSDimitry Andric 230bdd1243dSDimitry Andric SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, 231bdd1243dSDimitry Andric ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 232*0fca6ea1SDimitry Andric : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) { 233bdd1243dSDimitry Andric SGID = NumSchedGroups++; 234bdd1243dSDimitry Andric } 235bdd1243dSDimitry Andric 236bdd1243dSDimitry Andric SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID, 237bdd1243dSDimitry Andric ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 238*0fca6ea1SDimitry Andric : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) { 239bdd1243dSDimitry Andric SGID = NumSchedGroups++; 240bdd1243dSDimitry Andric } 241bdd1243dSDimitry Andric }; 242bdd1243dSDimitry Andric 243bdd1243dSDimitry Andric // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. 244bdd1243dSDimitry Andric static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { 245bdd1243dSDimitry Andric assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || 246bdd1243dSDimitry Andric SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || 247bdd1243dSDimitry Andric SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT); 248bdd1243dSDimitry Andric 249bdd1243dSDimitry Andric while (!SU.Preds.empty()) 250bdd1243dSDimitry Andric for (auto &P : SU.Preds) 251bdd1243dSDimitry Andric SU.removePred(P); 252bdd1243dSDimitry Andric 253bdd1243dSDimitry Andric while (!SU.Succs.empty()) 254bdd1243dSDimitry Andric for (auto &S : SU.Succs) 255bdd1243dSDimitry Andric for (auto &SP : S.getSUnit()->Preds) 256bdd1243dSDimitry Andric if (SP.getSUnit() == &SU) 257bdd1243dSDimitry Andric S.getSUnit()->removePred(SP); 258bdd1243dSDimitry Andric } 259bdd1243dSDimitry Andric 260*0fca6ea1SDimitry Andric using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>; 261*0fca6ea1SDimitry Andric using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>; 262bdd1243dSDimitry Andric 263bdd1243dSDimitry Andric // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline 264bdd1243dSDimitry Andric // in non-trivial cases. For example, if the requested pipeline is 265bdd1243dSDimitry Andric // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction 266bdd1243dSDimitry Andric // in the DAG, then we will have an instruction that can not be trivially 267bdd1243dSDimitry Andric // assigned to a SchedGroup. The PipelineSolver class implements two algorithms 268bdd1243dSDimitry Andric // to find a good solution to the pipeline -- a greedy algorithm and an exact 269bdd1243dSDimitry Andric // algorithm. The exact algorithm has an exponential time complexity and should 270bdd1243dSDimitry Andric // only be used for small sized problems or medium sized problems where an exact 271bdd1243dSDimitry Andric // solution is highly desired. 272bdd1243dSDimitry Andric class PipelineSolver { 273bdd1243dSDimitry Andric ScheduleDAGMI *DAG; 274bdd1243dSDimitry Andric 275bdd1243dSDimitry Andric // Instructions that can be assigned to multiple SchedGroups 276bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; 277bdd1243dSDimitry Andric SmallVector<SUsToCandSGsVec, 4> PipelineInstrs; 278bdd1243dSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; 279bdd1243dSDimitry Andric // The current working pipeline 280bdd1243dSDimitry Andric SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline; 281bdd1243dSDimitry Andric // The pipeline that has the best solution found so far 282bdd1243dSDimitry Andric SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline; 283bdd1243dSDimitry Andric 284bdd1243dSDimitry Andric // Whether or not we actually have any SyncedInstrs to try to solve. 285bdd1243dSDimitry Andric bool NeedsSolver = false; 286bdd1243dSDimitry Andric 287bdd1243dSDimitry Andric // Compute an estimate of the size of search tree -- the true size is 288bdd1243dSDimitry Andric // the product of each conflictedInst.Matches.size() across all SyncPipelines 289bdd1243dSDimitry Andric unsigned computeProblemSize(); 290bdd1243dSDimitry Andric 291bdd1243dSDimitry Andric // The cost penalty of not assigning a SU to a SchedGroup 292bdd1243dSDimitry Andric int MissPenalty = 0; 293bdd1243dSDimitry Andric 294bdd1243dSDimitry Andric // Costs in terms of the number of edges we are unable to add 295bdd1243dSDimitry Andric int BestCost = -1; 296bdd1243dSDimitry Andric int CurrCost = 0; 297bdd1243dSDimitry Andric 298bdd1243dSDimitry Andric // Index pointing to the conflicting instruction that is currently being 299bdd1243dSDimitry Andric // fitted 300bdd1243dSDimitry Andric int CurrConflInstNo = 0; 301bdd1243dSDimitry Andric // Index to the pipeline that is currently being fitted 302bdd1243dSDimitry Andric int CurrSyncGroupIdx = 0; 303bdd1243dSDimitry Andric // The first non trivial pipeline 304bdd1243dSDimitry Andric int BeginSyncGroupIdx = 0; 305bdd1243dSDimitry Andric 306bdd1243dSDimitry Andric // How many branches we have explored 307bdd1243dSDimitry Andric uint64_t BranchesExplored = 0; 308bdd1243dSDimitry Andric 30906c3fb27SDimitry Andric // The direction in which we process the candidate SchedGroups per SU 310*0fca6ea1SDimitry Andric bool IsBottomUp = true; 31106c3fb27SDimitry Andric 312bdd1243dSDimitry Andric // Update indices to fit next conflicting instruction 313bdd1243dSDimitry Andric void advancePosition(); 314bdd1243dSDimitry Andric // Recede indices to attempt to find better fit for previous conflicting 315bdd1243dSDimitry Andric // instruction 316bdd1243dSDimitry Andric void retreatPosition(); 317bdd1243dSDimitry Andric 318bdd1243dSDimitry Andric // The exponential time algorithm which finds the provably best fit 319bdd1243dSDimitry Andric bool solveExact(); 320bdd1243dSDimitry Andric // The polynomial time algorithm which attempts to find a good fit 321bdd1243dSDimitry Andric bool solveGreedy(); 32206c3fb27SDimitry Andric // Find the best SchedGroup for the current SU using the heuristic given all 32306c3fb27SDimitry Andric // current information. One step in the greedy algorithm. Templated against 32406c3fb27SDimitry Andric // the SchedGroup iterator (either reverse or forward). 32506c3fb27SDimitry Andric template <typename T> 32606c3fb27SDimitry Andric void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, 32706c3fb27SDimitry Andric T E); 328bdd1243dSDimitry Andric // Whether or not the current solution is optimal 329bdd1243dSDimitry Andric bool checkOptimal(); 330bdd1243dSDimitry Andric // Populate the ready list, prioiritizing fewest missed edges first 33106c3fb27SDimitry Andric // Templated against the SchedGroup iterator (either reverse or forward). 33206c3fb27SDimitry Andric template <typename T> 33306c3fb27SDimitry Andric void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, 33406c3fb27SDimitry Andric T E); 335bdd1243dSDimitry Andric // Add edges corresponding to the SchedGroups as assigned by solver 336bdd1243dSDimitry Andric void makePipeline(); 33706c3fb27SDimitry Andric // Link the SchedGroups in the best found pipeline. 33806c3fb27SDimitry Andric // Tmplated against the SchedGroup iterator (either reverse or forward). 33906c3fb27SDimitry Andric template <typename T> void linkSchedGroups(T I, T E); 340bdd1243dSDimitry Andric // Add the edges from the SU to the other SchedGroups in pipeline, and 341bdd1243dSDimitry Andric // return the number of edges missed. 342bdd1243dSDimitry Andric int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, 343bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 3445f757f3fSDimitry Andric /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It 3455f757f3fSDimitry Andric /// returns the cost (in terms of missed pipeline edges), and tracks the edges 3465f757f3fSDimitry Andric /// added in \p AddedEdges 34706c3fb27SDimitry Andric template <typename T> 34806c3fb27SDimitry Andric int linkSUnit(SUnit *SU, int SGID, 34906c3fb27SDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); 3505f757f3fSDimitry Andric /// Remove the edges passed via \p AddedEdges 351bdd1243dSDimitry Andric void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 352bdd1243dSDimitry Andric // Convert the passed in maps to arrays for bidirectional iterators 353bdd1243dSDimitry Andric void convertSyncMapsToArrays(); 354bdd1243dSDimitry Andric 355bdd1243dSDimitry Andric void reset(); 356bdd1243dSDimitry Andric 357bdd1243dSDimitry Andric public: 358bdd1243dSDimitry Andric // Invoke the solver to map instructions to instruction groups. Heuristic && 359bdd1243dSDimitry Andric // command-line-option determines to use exact or greedy algorithm. 360bdd1243dSDimitry Andric void solve(); 361bdd1243dSDimitry Andric 362bdd1243dSDimitry Andric PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 363bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 364*0fca6ea1SDimitry Andric ScheduleDAGMI *DAG, bool IsBottomUp = true) 365bdd1243dSDimitry Andric : DAG(DAG), SyncedInstrs(SyncedInstrs), 36606c3fb27SDimitry Andric SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { 367bdd1243dSDimitry Andric 368bdd1243dSDimitry Andric for (auto &PipelineInstrs : SyncedInstrs) { 369bdd1243dSDimitry Andric if (PipelineInstrs.second.size() > 0) { 370bdd1243dSDimitry Andric NeedsSolver = true; 371bdd1243dSDimitry Andric break; 372bdd1243dSDimitry Andric } 373bdd1243dSDimitry Andric } 374bdd1243dSDimitry Andric 375bdd1243dSDimitry Andric if (!NeedsSolver) 376bdd1243dSDimitry Andric return; 377bdd1243dSDimitry Andric 378bdd1243dSDimitry Andric convertSyncMapsToArrays(); 379bdd1243dSDimitry Andric 380bdd1243dSDimitry Andric CurrPipeline = BestPipeline; 381bdd1243dSDimitry Andric 382bdd1243dSDimitry Andric while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() && 383bdd1243dSDimitry Andric PipelineInstrs[BeginSyncGroupIdx].size() == 0) 384bdd1243dSDimitry Andric ++BeginSyncGroupIdx; 385bdd1243dSDimitry Andric 386bdd1243dSDimitry Andric if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size()) 387bdd1243dSDimitry Andric return; 388bdd1243dSDimitry Andric } 389bdd1243dSDimitry Andric }; 390bdd1243dSDimitry Andric 391bdd1243dSDimitry Andric void PipelineSolver::reset() { 392bdd1243dSDimitry Andric 393bdd1243dSDimitry Andric for (auto &SyncPipeline : CurrPipeline) { 394bdd1243dSDimitry Andric for (auto &SG : SyncPipeline) { 395bdd1243dSDimitry Andric SmallVector<SUnit *, 32> TempCollection = SG.Collection; 396bdd1243dSDimitry Andric SG.Collection.clear(); 397bdd1243dSDimitry Andric auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { 398bdd1243dSDimitry Andric return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; 399bdd1243dSDimitry Andric }); 400bdd1243dSDimitry Andric if (SchedBarr != TempCollection.end()) 401bdd1243dSDimitry Andric SG.Collection.push_back(*SchedBarr); 402bdd1243dSDimitry Andric } 403bdd1243dSDimitry Andric } 404bdd1243dSDimitry Andric 405bdd1243dSDimitry Andric CurrSyncGroupIdx = BeginSyncGroupIdx; 406bdd1243dSDimitry Andric CurrConflInstNo = 0; 407bdd1243dSDimitry Andric CurrCost = 0; 408bdd1243dSDimitry Andric } 409bdd1243dSDimitry Andric 410bdd1243dSDimitry Andric void PipelineSolver::convertSyncMapsToArrays() { 411bdd1243dSDimitry Andric for (auto &SyncPipe : SyncedSchedGroups) { 412bdd1243dSDimitry Andric BestPipeline.insert(BestPipeline.begin(), SyncPipe.second); 413bdd1243dSDimitry Andric } 414bdd1243dSDimitry Andric 415bdd1243dSDimitry Andric int PipelineIDx = SyncedInstrs.size() - 1; 416bdd1243dSDimitry Andric PipelineInstrs.resize(SyncedInstrs.size()); 417bdd1243dSDimitry Andric for (auto &SyncInstrMap : SyncedInstrs) { 418bdd1243dSDimitry Andric for (auto &SUsToCandSGs : SyncInstrMap.second) { 419bdd1243dSDimitry Andric if (PipelineInstrs[PipelineIDx].size() == 0) { 420bdd1243dSDimitry Andric PipelineInstrs[PipelineIDx].push_back( 421bdd1243dSDimitry Andric std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 422bdd1243dSDimitry Andric continue; 423bdd1243dSDimitry Andric } 424bdd1243dSDimitry Andric auto SortPosition = PipelineInstrs[PipelineIDx].begin(); 425bdd1243dSDimitry Andric // Insert them in sorted order -- this allows for good parsing order in 426bdd1243dSDimitry Andric // the greedy algorithm 427bdd1243dSDimitry Andric while (SortPosition != PipelineInstrs[PipelineIDx].end() && 428bdd1243dSDimitry Andric SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum) 429bdd1243dSDimitry Andric ++SortPosition; 430bdd1243dSDimitry Andric PipelineInstrs[PipelineIDx].insert( 431bdd1243dSDimitry Andric SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 432bdd1243dSDimitry Andric } 433bdd1243dSDimitry Andric --PipelineIDx; 434bdd1243dSDimitry Andric } 435bdd1243dSDimitry Andric } 436bdd1243dSDimitry Andric 43706c3fb27SDimitry Andric template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { 43806c3fb27SDimitry Andric for (; I != E; ++I) { 43906c3fb27SDimitry Andric auto &GroupA = *I; 44006c3fb27SDimitry Andric for (auto J = std::next(I); J != E; ++J) { 44106c3fb27SDimitry Andric auto &GroupB = *J; 44206c3fb27SDimitry Andric GroupA.link(GroupB); 44306c3fb27SDimitry Andric } 44406c3fb27SDimitry Andric } 44506c3fb27SDimitry Andric } 44606c3fb27SDimitry Andric 447bdd1243dSDimitry Andric void PipelineSolver::makePipeline() { 448bdd1243dSDimitry Andric // Preserve the order of barrier for subsequent SchedGroupBarrier mutations 449bdd1243dSDimitry Andric for (auto &SyncPipeline : BestPipeline) { 45006c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "Printing SchedGroups\n"); 451bdd1243dSDimitry Andric for (auto &SG : SyncPipeline) { 45206c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() 45306c3fb27SDimitry Andric << " has: \n"); 454bdd1243dSDimitry Andric SUnit *SGBarr = nullptr; 455bdd1243dSDimitry Andric for (auto &SU : SG.Collection) { 456bdd1243dSDimitry Andric if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 457bdd1243dSDimitry Andric SGBarr = SU; 45806c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); 459bdd1243dSDimitry Andric } 460bdd1243dSDimitry Andric // Command line requested IGroupLP doesn't have SGBarr 461bdd1243dSDimitry Andric if (!SGBarr) 462bdd1243dSDimitry Andric continue; 463bdd1243dSDimitry Andric resetEdges(*SGBarr, DAG); 464bdd1243dSDimitry Andric SG.link(*SGBarr, false); 465bdd1243dSDimitry Andric } 466bdd1243dSDimitry Andric } 467bdd1243dSDimitry Andric 468bdd1243dSDimitry Andric for (auto &SyncPipeline : BestPipeline) { 46906c3fb27SDimitry Andric IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) 47006c3fb27SDimitry Andric : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); 47181ad6265SDimitry Andric } 47281ad6265SDimitry Andric } 47306c3fb27SDimitry Andric 47406c3fb27SDimitry Andric template <typename T> 47506c3fb27SDimitry Andric int PipelineSolver::linkSUnit( 47606c3fb27SDimitry Andric SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, 47706c3fb27SDimitry Andric T I, T E) { 47806c3fb27SDimitry Andric bool MakePred = false; 47906c3fb27SDimitry Andric int AddedCost = 0; 48006c3fb27SDimitry Andric for (; I < E; ++I) { 48106c3fb27SDimitry Andric if (I->getSGID() == SGID) { 48206c3fb27SDimitry Andric MakePred = true; 48306c3fb27SDimitry Andric continue; 48481ad6265SDimitry Andric } 48506c3fb27SDimitry Andric auto Group = *I; 48606c3fb27SDimitry Andric AddedCost += Group.link(*SU, MakePred, AddedEdges); 48706c3fb27SDimitry Andric assert(AddedCost >= 0); 48806c3fb27SDimitry Andric } 48906c3fb27SDimitry Andric return AddedCost; 490bdd1243dSDimitry Andric } 49181ad6265SDimitry Andric 492bdd1243dSDimitry Andric int PipelineSolver::addEdges( 493bdd1243dSDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, 494bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 495bdd1243dSDimitry Andric 49606c3fb27SDimitry Andric // For IsBottomUp, the first SchedGroup in SyncPipeline contains the 49706c3fb27SDimitry Andric // instructions that are the ultimate successors in the resultant mutation. 49806c3fb27SDimitry Andric // Therefore, in such a configuration, the SchedGroups occurring before the 49906c3fb27SDimitry Andric // candidate SGID are successors of the candidate SchedGroup, thus the current 50006c3fb27SDimitry Andric // SU should be linked as a predecessor to SUs in those SchedGroups. The 50106c3fb27SDimitry Andric // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple 50206c3fb27SDimitry Andric // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using 50306c3fb27SDimitry Andric // IsBottomUp (in reverse). 50406c3fb27SDimitry Andric return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), 50506c3fb27SDimitry Andric SyncPipeline.rend()) 50606c3fb27SDimitry Andric : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), 50706c3fb27SDimitry Andric SyncPipeline.end()); 508bdd1243dSDimitry Andric } 509bdd1243dSDimitry Andric 510bdd1243dSDimitry Andric void PipelineSolver::removeEdges( 511bdd1243dSDimitry Andric const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { 512bdd1243dSDimitry Andric // Only remove the edges that we have added when testing 513bdd1243dSDimitry Andric // the fit. 514bdd1243dSDimitry Andric for (auto &PredSuccPair : EdgesToRemove) { 515bdd1243dSDimitry Andric SUnit *Pred = PredSuccPair.first; 516bdd1243dSDimitry Andric SUnit *Succ = PredSuccPair.second; 517bdd1243dSDimitry Andric 518bdd1243dSDimitry Andric auto Match = llvm::find_if( 519bdd1243dSDimitry Andric Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); 520bdd1243dSDimitry Andric if (Match != Succ->Preds.end()) { 521bdd1243dSDimitry Andric assert(Match->isArtificial()); 522bdd1243dSDimitry Andric Succ->removePred(*Match); 523bdd1243dSDimitry Andric } 524bdd1243dSDimitry Andric } 525bdd1243dSDimitry Andric } 526bdd1243dSDimitry Andric 527bdd1243dSDimitry Andric void PipelineSolver::advancePosition() { 528bdd1243dSDimitry Andric ++CurrConflInstNo; 529bdd1243dSDimitry Andric 530bdd1243dSDimitry Andric if (static_cast<size_t>(CurrConflInstNo) >= 531bdd1243dSDimitry Andric PipelineInstrs[CurrSyncGroupIdx].size()) { 532bdd1243dSDimitry Andric CurrConflInstNo = 0; 533bdd1243dSDimitry Andric ++CurrSyncGroupIdx; 534bdd1243dSDimitry Andric // Advance to next non-trivial pipeline 535bdd1243dSDimitry Andric while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && 536bdd1243dSDimitry Andric PipelineInstrs[CurrSyncGroupIdx].size() == 0) 537bdd1243dSDimitry Andric ++CurrSyncGroupIdx; 538bdd1243dSDimitry Andric } 539bdd1243dSDimitry Andric } 540bdd1243dSDimitry Andric 541bdd1243dSDimitry Andric void PipelineSolver::retreatPosition() { 542bdd1243dSDimitry Andric assert(CurrConflInstNo >= 0); 543bdd1243dSDimitry Andric assert(CurrSyncGroupIdx >= 0); 544bdd1243dSDimitry Andric 545bdd1243dSDimitry Andric if (CurrConflInstNo > 0) { 546bdd1243dSDimitry Andric --CurrConflInstNo; 547bdd1243dSDimitry Andric return; 548bdd1243dSDimitry Andric } 549bdd1243dSDimitry Andric 550bdd1243dSDimitry Andric if (CurrConflInstNo == 0) { 551bdd1243dSDimitry Andric // If we return to the starting position, we have explored 552bdd1243dSDimitry Andric // the entire tree 553bdd1243dSDimitry Andric if (CurrSyncGroupIdx == BeginSyncGroupIdx) 554bdd1243dSDimitry Andric return; 555bdd1243dSDimitry Andric 556bdd1243dSDimitry Andric --CurrSyncGroupIdx; 557bdd1243dSDimitry Andric // Go to previous non-trivial pipeline 558bdd1243dSDimitry Andric while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) 559bdd1243dSDimitry Andric --CurrSyncGroupIdx; 560bdd1243dSDimitry Andric 561bdd1243dSDimitry Andric CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; 562bdd1243dSDimitry Andric } 563bdd1243dSDimitry Andric } 564bdd1243dSDimitry Andric 565bdd1243dSDimitry Andric bool PipelineSolver::checkOptimal() { 566bdd1243dSDimitry Andric if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { 567bdd1243dSDimitry Andric if (BestCost == -1 || CurrCost < BestCost) { 568bdd1243dSDimitry Andric BestPipeline = CurrPipeline; 569bdd1243dSDimitry Andric BestCost = CurrCost; 570bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n"); 571bdd1243dSDimitry Andric } 572bdd1243dSDimitry Andric assert(BestCost >= 0); 573bdd1243dSDimitry Andric } 574bdd1243dSDimitry Andric 575bdd1243dSDimitry Andric bool DoneExploring = false; 576bdd1243dSDimitry Andric if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) 577bdd1243dSDimitry Andric DoneExploring = true; 578bdd1243dSDimitry Andric 579bdd1243dSDimitry Andric return (DoneExploring || BestCost == 0); 580bdd1243dSDimitry Andric } 581bdd1243dSDimitry Andric 58206c3fb27SDimitry Andric template <typename T> 583bdd1243dSDimitry Andric void PipelineSolver::populateReadyList( 58406c3fb27SDimitry Andric SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { 58506c3fb27SDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 58606c3fb27SDimitry Andric auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 587bdd1243dSDimitry Andric assert(CurrSU.second.size() >= 1); 58806c3fb27SDimitry Andric 589bdd1243dSDimitry Andric for (; I != E; ++I) { 590bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 591bdd1243dSDimitry Andric int CandSGID = *I; 5925f757f3fSDimitry Andric SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 5935f757f3fSDimitry Andric return SG.getSGID() == CandSGID; 5945f757f3fSDimitry Andric }); 5955f757f3fSDimitry Andric assert(Match); 596bdd1243dSDimitry Andric 597bdd1243dSDimitry Andric if (UseCostHeur) { 598bdd1243dSDimitry Andric if (Match->isFull()) { 599bdd1243dSDimitry Andric ReadyList.push_back(std::pair(*I, MissPenalty)); 600bdd1243dSDimitry Andric continue; 601bdd1243dSDimitry Andric } 602bdd1243dSDimitry Andric 603bdd1243dSDimitry Andric int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 604bdd1243dSDimitry Andric ReadyList.push_back(std::pair(*I, TempCost)); 605bdd1243dSDimitry Andric removeEdges(AddedEdges); 606bdd1243dSDimitry Andric } else 607bdd1243dSDimitry Andric ReadyList.push_back(std::pair(*I, -1)); 608bdd1243dSDimitry Andric } 609bdd1243dSDimitry Andric 610bdd1243dSDimitry Andric if (UseCostHeur) { 611bdd1243dSDimitry Andric std::sort(ReadyList.begin(), ReadyList.end(), 612bdd1243dSDimitry Andric [](std::pair<int, int> A, std::pair<int, int> B) { 613bdd1243dSDimitry Andric return A.second < B.second; 614bdd1243dSDimitry Andric }); 615bdd1243dSDimitry Andric } 616bdd1243dSDimitry Andric 617bdd1243dSDimitry Andric assert(ReadyList.size() == CurrSU.second.size()); 618bdd1243dSDimitry Andric } 619bdd1243dSDimitry Andric 620bdd1243dSDimitry Andric bool PipelineSolver::solveExact() { 621bdd1243dSDimitry Andric if (checkOptimal()) 622bdd1243dSDimitry Andric return true; 623bdd1243dSDimitry Andric 624bdd1243dSDimitry Andric if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) 625bdd1243dSDimitry Andric return false; 626bdd1243dSDimitry Andric 627bdd1243dSDimitry Andric assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); 628bdd1243dSDimitry Andric assert(static_cast<size_t>(CurrConflInstNo) < 629bdd1243dSDimitry Andric PipelineInstrs[CurrSyncGroupIdx].size()); 630bdd1243dSDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 631bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 632bdd1243dSDimitry Andric << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 633bdd1243dSDimitry Andric 634bdd1243dSDimitry Andric // SchedGroup -> Cost pairs 635bdd1243dSDimitry Andric SmallVector<std::pair<int, int>, 4> ReadyList; 636bdd1243dSDimitry Andric // Prioritize the candidate sched groups in terms of lowest cost first 63706c3fb27SDimitry Andric IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(), 63806c3fb27SDimitry Andric CurrSU.second.rend()) 63906c3fb27SDimitry Andric : populateReadyList(ReadyList, CurrSU.second.begin(), 64006c3fb27SDimitry Andric CurrSU.second.end()); 641bdd1243dSDimitry Andric 642bdd1243dSDimitry Andric auto I = ReadyList.begin(); 643bdd1243dSDimitry Andric auto E = ReadyList.end(); 644bdd1243dSDimitry Andric for (; I != E; ++I) { 645bdd1243dSDimitry Andric // If we are trying SGs in least cost order, and the current SG is cost 646bdd1243dSDimitry Andric // infeasible, then all subsequent SGs will also be cost infeasible, so we 647bdd1243dSDimitry Andric // can prune. 648bdd1243dSDimitry Andric if (BestCost != -1 && (CurrCost + I->second > BestCost)) 649bdd1243dSDimitry Andric return false; 650bdd1243dSDimitry Andric 651bdd1243dSDimitry Andric int CandSGID = I->first; 652bdd1243dSDimitry Andric int AddedCost = 0; 653bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 654bdd1243dSDimitry Andric auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 655bdd1243dSDimitry Andric SchedGroup *Match; 656bdd1243dSDimitry Andric for (auto &SG : SyncPipeline) { 657bdd1243dSDimitry Andric if (SG.getSGID() == CandSGID) 658bdd1243dSDimitry Andric Match = &SG; 659bdd1243dSDimitry Andric } 660bdd1243dSDimitry Andric 661bdd1243dSDimitry Andric if (Match->isFull()) 662bdd1243dSDimitry Andric continue; 663bdd1243dSDimitry Andric 66406c3fb27SDimitry Andric if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) 66506c3fb27SDimitry Andric continue; 66606c3fb27SDimitry Andric 667bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " 668bdd1243dSDimitry Andric << (int)Match->getMask() << "and ID " << CandSGID 669bdd1243dSDimitry Andric << "\n"); 670bdd1243dSDimitry Andric Match->add(*CurrSU.first); 671bdd1243dSDimitry Andric AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 672bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n"); 673bdd1243dSDimitry Andric CurrCost += AddedCost; 674bdd1243dSDimitry Andric advancePosition(); 675bdd1243dSDimitry Andric ++BranchesExplored; 676bdd1243dSDimitry Andric bool FinishedExploring = false; 677bdd1243dSDimitry Andric // If the Cost after adding edges is greater than a known solution, 678bdd1243dSDimitry Andric // backtrack 679bdd1243dSDimitry Andric if (CurrCost < BestCost || BestCost == -1) { 680bdd1243dSDimitry Andric if (solveExact()) { 681bdd1243dSDimitry Andric FinishedExploring = BestCost != 0; 682bdd1243dSDimitry Andric if (!FinishedExploring) 683bdd1243dSDimitry Andric return true; 684bdd1243dSDimitry Andric } 685bdd1243dSDimitry Andric } 686bdd1243dSDimitry Andric 687bdd1243dSDimitry Andric retreatPosition(); 688bdd1243dSDimitry Andric CurrCost -= AddedCost; 689bdd1243dSDimitry Andric removeEdges(AddedEdges); 690bdd1243dSDimitry Andric Match->pop(); 691bdd1243dSDimitry Andric CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 692bdd1243dSDimitry Andric if (FinishedExploring) 693bdd1243dSDimitry Andric return true; 694bdd1243dSDimitry Andric } 695bdd1243dSDimitry Andric 696bdd1243dSDimitry Andric // Try the pipeline where the current instruction is omitted 697bdd1243dSDimitry Andric // Potentially if we omit a problematic instruction from the pipeline, 698bdd1243dSDimitry Andric // all the other instructions can nicely fit. 699bdd1243dSDimitry Andric CurrCost += MissPenalty; 700bdd1243dSDimitry Andric advancePosition(); 701bdd1243dSDimitry Andric 702bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n"); 703bdd1243dSDimitry Andric 704bdd1243dSDimitry Andric bool FinishedExploring = false; 705bdd1243dSDimitry Andric if (CurrCost < BestCost || BestCost == -1) { 706bdd1243dSDimitry Andric if (solveExact()) { 707bdd1243dSDimitry Andric bool FinishedExploring = BestCost != 0; 708bdd1243dSDimitry Andric if (!FinishedExploring) 709bdd1243dSDimitry Andric return true; 710bdd1243dSDimitry Andric } 711bdd1243dSDimitry Andric } 712bdd1243dSDimitry Andric 713bdd1243dSDimitry Andric retreatPosition(); 714bdd1243dSDimitry Andric CurrCost -= MissPenalty; 715bdd1243dSDimitry Andric return FinishedExploring; 716bdd1243dSDimitry Andric } 717bdd1243dSDimitry Andric 71806c3fb27SDimitry Andric template <typename T> 71906c3fb27SDimitry Andric void PipelineSolver::greedyFind( 72006c3fb27SDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { 721bdd1243dSDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 722bdd1243dSDimitry Andric int BestNodeCost = -1; 723bdd1243dSDimitry Andric int TempCost; 724bdd1243dSDimitry Andric SchedGroup *BestGroup = nullptr; 725bdd1243dSDimitry Andric int BestGroupID = -1; 726bdd1243dSDimitry Andric auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 727bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 728bdd1243dSDimitry Andric << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 729bdd1243dSDimitry Andric 730bdd1243dSDimitry Andric // Since we have added the potential SchedGroups from bottom up, but 731bdd1243dSDimitry Andric // traversed the DAG from top down, parse over the groups from last to 732bdd1243dSDimitry Andric // first. If we fail to do this for the greedy algorithm, the solution will 733bdd1243dSDimitry Andric // likely not be good in more complex cases. 734bdd1243dSDimitry Andric for (; I != E; ++I) { 735bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 736bdd1243dSDimitry Andric int CandSGID = *I; 7375f757f3fSDimitry Andric SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 7385f757f3fSDimitry Andric return SG.getSGID() == CandSGID; 7395f757f3fSDimitry Andric }); 7405f757f3fSDimitry Andric assert(Match); 741bdd1243dSDimitry Andric 742bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " 743bdd1243dSDimitry Andric << (int)Match->getMask() << "\n"); 744bdd1243dSDimitry Andric 745bdd1243dSDimitry Andric if (Match->isFull()) { 746bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); 747bdd1243dSDimitry Andric continue; 748bdd1243dSDimitry Andric } 74906c3fb27SDimitry Andric if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { 75006c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n"); 75106c3fb27SDimitry Andric continue; 75206c3fb27SDimitry Andric } 753bdd1243dSDimitry Andric TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 754bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); 755bdd1243dSDimitry Andric if (TempCost < BestNodeCost || BestNodeCost == -1) { 756bdd1243dSDimitry Andric BestGroup = Match; 757bdd1243dSDimitry Andric BestNodeCost = TempCost; 758bdd1243dSDimitry Andric BestGroupID = CandSGID; 759bdd1243dSDimitry Andric } 760bdd1243dSDimitry Andric removeEdges(AddedEdges); 761bdd1243dSDimitry Andric if (BestNodeCost == 0) 762bdd1243dSDimitry Andric break; 763bdd1243dSDimitry Andric } 764bdd1243dSDimitry Andric 765bdd1243dSDimitry Andric if (BestGroupID != -1) { 766bdd1243dSDimitry Andric BestGroup->add(*CurrSU.first); 767bdd1243dSDimitry Andric addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); 768bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" 769bdd1243dSDimitry Andric << (int)BestGroup->getMask() << "\n"); 770bdd1243dSDimitry Andric BestCost += TempCost; 771bdd1243dSDimitry Andric } else 772bdd1243dSDimitry Andric BestCost += MissPenalty; 773bdd1243dSDimitry Andric 774bdd1243dSDimitry Andric CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 77506c3fb27SDimitry Andric } 77606c3fb27SDimitry Andric 77706c3fb27SDimitry Andric bool PipelineSolver::solveGreedy() { 77806c3fb27SDimitry Andric BestCost = 0; 77906c3fb27SDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 78006c3fb27SDimitry Andric 78106c3fb27SDimitry Andric while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { 78206c3fb27SDimitry Andric SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 78306c3fb27SDimitry Andric IsBottomUp 78406c3fb27SDimitry Andric ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) 78506c3fb27SDimitry Andric : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); 786bdd1243dSDimitry Andric advancePosition(); 787bdd1243dSDimitry Andric } 788bdd1243dSDimitry Andric BestPipeline = CurrPipeline; 789bdd1243dSDimitry Andric removeEdges(AddedEdges); 790bdd1243dSDimitry Andric return false; 791bdd1243dSDimitry Andric } 792bdd1243dSDimitry Andric 793bdd1243dSDimitry Andric unsigned PipelineSolver::computeProblemSize() { 794bdd1243dSDimitry Andric unsigned ProblemSize = 0; 795bdd1243dSDimitry Andric for (auto &PipeConflicts : PipelineInstrs) { 796bdd1243dSDimitry Andric ProblemSize += PipeConflicts.size(); 797bdd1243dSDimitry Andric } 798bdd1243dSDimitry Andric 799bdd1243dSDimitry Andric return ProblemSize; 800bdd1243dSDimitry Andric } 801bdd1243dSDimitry Andric 802bdd1243dSDimitry Andric void PipelineSolver::solve() { 803bdd1243dSDimitry Andric if (!NeedsSolver) 804bdd1243dSDimitry Andric return; 805bdd1243dSDimitry Andric 806bdd1243dSDimitry Andric unsigned ProblemSize = computeProblemSize(); 807bdd1243dSDimitry Andric assert(ProblemSize > 0); 808bdd1243dSDimitry Andric 809bdd1243dSDimitry Andric bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; 810bdd1243dSDimitry Andric MissPenalty = (ProblemSize / 2) + 1; 811bdd1243dSDimitry Andric 812bdd1243dSDimitry Andric LLVM_DEBUG(DAG->dump()); 813bdd1243dSDimitry Andric if (EnableExactSolver || BelowCutoff) { 814bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n"); 815bdd1243dSDimitry Andric solveGreedy(); 816bdd1243dSDimitry Andric reset(); 817bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n"); 818bdd1243dSDimitry Andric if (BestCost > 0) { 819bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n"); 820bdd1243dSDimitry Andric solveExact(); 821bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n"); 822bdd1243dSDimitry Andric } 823bdd1243dSDimitry Andric } else { // Use the Greedy Algorithm by default 824bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n"); 825bdd1243dSDimitry Andric solveGreedy(); 826bdd1243dSDimitry Andric } 827bdd1243dSDimitry Andric 828bdd1243dSDimitry Andric makePipeline(); 82906c3fb27SDimitry Andric LLVM_DEBUG(dbgs() << "After applying mutation\n"); 83006c3fb27SDimitry Andric LLVM_DEBUG(DAG->dump()); 831bdd1243dSDimitry Andric } 832bdd1243dSDimitry Andric 83306c3fb27SDimitry Andric enum IGLPStrategyID : int { 83406c3fb27SDimitry Andric MFMASmallGemmOptID = 0, 83506c3fb27SDimitry Andric MFMASmallGemmSingleWaveOptID = 1, 836*0fca6ea1SDimitry Andric MFMAExpInterleave = 2 83706c3fb27SDimitry Andric }; 838bdd1243dSDimitry Andric 839bdd1243dSDimitry Andric // Implement a IGLP scheduling strategy. 840bdd1243dSDimitry Andric class IGLPStrategy { 841bdd1243dSDimitry Andric protected: 842bdd1243dSDimitry Andric ScheduleDAGInstrs *DAG; 843bdd1243dSDimitry Andric 844bdd1243dSDimitry Andric const SIInstrInfo *TII; 845bdd1243dSDimitry Andric 846bdd1243dSDimitry Andric public: 8475f757f3fSDimitry Andric /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. 848*0fca6ea1SDimitry Andric virtual bool applyIGLPStrategy( 849bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 8505f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 851*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) = 0; 852bdd1243dSDimitry Andric 853bdd1243dSDimitry Andric // Returns true if this strategy should be applied to a ScheduleDAG. 854*0fca6ea1SDimitry Andric virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 855*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) = 0; 856bdd1243dSDimitry Andric 857*0fca6ea1SDimitry Andric bool IsBottomUp = true; 85806c3fb27SDimitry Andric 859bdd1243dSDimitry Andric IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 860bdd1243dSDimitry Andric : DAG(DAG), TII(TII) {} 861bdd1243dSDimitry Andric 862bdd1243dSDimitry Andric virtual ~IGLPStrategy() = default; 863bdd1243dSDimitry Andric }; 864bdd1243dSDimitry Andric 865bdd1243dSDimitry Andric class MFMASmallGemmOpt final : public IGLPStrategy { 86606c3fb27SDimitry Andric private: 867bdd1243dSDimitry Andric public: 868*0fca6ea1SDimitry Andric bool applyIGLPStrategy( 869bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 8705f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 871*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) override; 872bdd1243dSDimitry Andric 873*0fca6ea1SDimitry Andric bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 874*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) override { 875*0fca6ea1SDimitry Andric return true; 876*0fca6ea1SDimitry Andric } 877bdd1243dSDimitry Andric 878bdd1243dSDimitry Andric MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 87906c3fb27SDimitry Andric : IGLPStrategy(DAG, TII) { 880*0fca6ea1SDimitry Andric IsBottomUp = true; 88106c3fb27SDimitry Andric } 882bdd1243dSDimitry Andric }; 883bdd1243dSDimitry Andric 884*0fca6ea1SDimitry Andric bool MFMASmallGemmOpt::applyIGLPStrategy( 885bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 8865f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 887*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) { 888bdd1243dSDimitry Andric // Count the number of MFMA instructions. 889bdd1243dSDimitry Andric unsigned MFMACount = 0; 890bdd1243dSDimitry Andric for (const MachineInstr &I : *DAG) 891bdd1243dSDimitry Andric if (TII->isMFMAorWMMA(I)) 892bdd1243dSDimitry Andric ++MFMACount; 893bdd1243dSDimitry Andric 894bdd1243dSDimitry Andric const unsigned PipelineSyncID = 0; 895bdd1243dSDimitry Andric SchedGroup *SG = nullptr; 896bdd1243dSDimitry Andric for (unsigned I = 0; I < MFMACount * 3; ++I) { 897bdd1243dSDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 898bdd1243dSDimitry Andric SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); 899bdd1243dSDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 900bdd1243dSDimitry Andric 901bdd1243dSDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 902bdd1243dSDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 903bdd1243dSDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 904bdd1243dSDimitry Andric } 905*0fca6ea1SDimitry Andric 906*0fca6ea1SDimitry Andric return true; 907*0fca6ea1SDimitry Andric } 908*0fca6ea1SDimitry Andric 909*0fca6ea1SDimitry Andric class MFMAExpInterleaveOpt final : public IGLPStrategy { 910*0fca6ea1SDimitry Andric private: 911*0fca6ea1SDimitry Andric // The count of TRANS SUs involved in the interleaved pipeline 912*0fca6ea1SDimitry Andric static unsigned TransPipeCount; 913*0fca6ea1SDimitry Andric // The count of MFMA SUs involved in the interleaved pipeline 914*0fca6ea1SDimitry Andric static unsigned MFMAPipeCount; 915*0fca6ea1SDimitry Andric // The count of Add SUs involved in the interleaved pipeline 916*0fca6ea1SDimitry Andric static unsigned AddPipeCount; 917*0fca6ea1SDimitry Andric // The number of transitive MFMA successors for each TRANS SU 918*0fca6ea1SDimitry Andric static unsigned MFMAEnablement; 919*0fca6ea1SDimitry Andric // The number of transitive TRANS predecessors for each MFMA SU 920*0fca6ea1SDimitry Andric static unsigned ExpRequirement; 921*0fca6ea1SDimitry Andric // The count of independent "chains" of MFMA instructions in the pipeline 922*0fca6ea1SDimitry Andric static unsigned MFMAChains; 923*0fca6ea1SDimitry Andric // The length of each independent "chain" of MFMA instructions 924*0fca6ea1SDimitry Andric static unsigned MFMAChainLength; 925*0fca6ea1SDimitry Andric // Whether or not the pipeline has V_CVT instructions 926*0fca6ea1SDimitry Andric static bool HasCvt; 927*0fca6ea1SDimitry Andric // Whether or not there are instructions between the TRANS instruction and 928*0fca6ea1SDimitry Andric // V_CVT 929*0fca6ea1SDimitry Andric static bool HasChainBetweenCvt; 930*0fca6ea1SDimitry Andric // The first occuring DS_READ which feeds an MFMA chain 931*0fca6ea1SDimitry Andric static std::optional<unsigned> FirstPipeDSR; 932*0fca6ea1SDimitry Andric // The MFMAPipe SUs with no MFMA predecessors 933*0fca6ea1SDimitry Andric SmallVector<SUnit *, 4> MFMAChainSeeds; 934*0fca6ea1SDimitry Andric // Compute the heuristics for the pipeline, returning whether or not the DAG 935*0fca6ea1SDimitry Andric // is well formatted for the mutation 936*0fca6ea1SDimitry Andric bool analyzeDAG(const SIInstrInfo *TII); 937*0fca6ea1SDimitry Andric 938*0fca6ea1SDimitry Andric /// Whether or not the instruction is a transitive predecessor of an MFMA 939*0fca6ea1SDimitry Andric /// instruction 940*0fca6ea1SDimitry Andric class IsPipeExp final : public InstructionRule { 941*0fca6ea1SDimitry Andric public: 942*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 943*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 944*0fca6ea1SDimitry Andric 945*0fca6ea1SDimitry Andric auto DAG = SyncPipe[0].DAG; 946*0fca6ea1SDimitry Andric 947*0fca6ea1SDimitry Andric if (Cache->empty()) { 948*0fca6ea1SDimitry Andric auto I = DAG->SUnits.rbegin(); 949*0fca6ea1SDimitry Andric auto E = DAG->SUnits.rend(); 950*0fca6ea1SDimitry Andric for (; I != E; I++) { 951*0fca6ea1SDimitry Andric if (TII->isMFMAorWMMA(*I->getInstr())) 952*0fca6ea1SDimitry Andric Cache->push_back(&*I); 953*0fca6ea1SDimitry Andric } 954*0fca6ea1SDimitry Andric if (Cache->empty()) 955*0fca6ea1SDimitry Andric return false; 956*0fca6ea1SDimitry Andric } 957*0fca6ea1SDimitry Andric 958*0fca6ea1SDimitry Andric auto Reaches = (std::any_of( 959*0fca6ea1SDimitry Andric Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) { 960*0fca6ea1SDimitry Andric return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU)); 961*0fca6ea1SDimitry Andric })); 962*0fca6ea1SDimitry Andric 963*0fca6ea1SDimitry Andric return Reaches; 964*0fca6ea1SDimitry Andric } 965*0fca6ea1SDimitry Andric IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 966*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 967*0fca6ea1SDimitry Andric }; 968*0fca6ea1SDimitry Andric 969*0fca6ea1SDimitry Andric /// Whether or not the instruction is a transitive predecessor of the 970*0fca6ea1SDimitry Andric /// \p Number th MFMA of the MFMAs occuring after a TRANS instruction 971*0fca6ea1SDimitry Andric class EnablesNthMFMA final : public InstructionRule { 972*0fca6ea1SDimitry Andric private: 973*0fca6ea1SDimitry Andric unsigned Number = 1; 974*0fca6ea1SDimitry Andric 975*0fca6ea1SDimitry Andric public: 976*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 977*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 978*0fca6ea1SDimitry Andric bool FoundTrans = false; 979*0fca6ea1SDimitry Andric unsigned Counter = 1; 980*0fca6ea1SDimitry Andric auto DAG = SyncPipe[0].DAG; 981*0fca6ea1SDimitry Andric 982*0fca6ea1SDimitry Andric if (Cache->empty()) { 983*0fca6ea1SDimitry Andric SmallVector<SUnit *, 8> Worklist; 984*0fca6ea1SDimitry Andric 985*0fca6ea1SDimitry Andric auto I = DAG->SUnits.begin(); 986*0fca6ea1SDimitry Andric auto E = DAG->SUnits.end(); 987*0fca6ea1SDimitry Andric for (; I != E; I++) { 988*0fca6ea1SDimitry Andric if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) { 989*0fca6ea1SDimitry Andric if (Counter == Number) { 990*0fca6ea1SDimitry Andric Cache->push_back(&*I); 991*0fca6ea1SDimitry Andric break; 992*0fca6ea1SDimitry Andric } 993*0fca6ea1SDimitry Andric ++Counter; 994*0fca6ea1SDimitry Andric } 995*0fca6ea1SDimitry Andric if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode())) 996*0fca6ea1SDimitry Andric FoundTrans = true; 997*0fca6ea1SDimitry Andric } 998*0fca6ea1SDimitry Andric if (Cache->empty()) 999*0fca6ea1SDimitry Andric return false; 1000*0fca6ea1SDimitry Andric } 1001*0fca6ea1SDimitry Andric 1002*0fca6ea1SDimitry Andric return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); 1003*0fca6ea1SDimitry Andric } 1004*0fca6ea1SDimitry Andric 1005*0fca6ea1SDimitry Andric EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID, 1006*0fca6ea1SDimitry Andric bool NeedsCache = false) 1007*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} 1008*0fca6ea1SDimitry Andric }; 1009*0fca6ea1SDimitry Andric 1010*0fca6ea1SDimitry Andric /// Whether or not the instruction enables the exact MFMA that is the \p 1011*0fca6ea1SDimitry Andric /// Number th MFMA in the chain starting with \p ChainSeed 1012*0fca6ea1SDimitry Andric class EnablesNthMFMAInChain final : public InstructionRule { 1013*0fca6ea1SDimitry Andric private: 1014*0fca6ea1SDimitry Andric unsigned Number = 1; 1015*0fca6ea1SDimitry Andric SUnit *ChainSeed; 1016*0fca6ea1SDimitry Andric 1017*0fca6ea1SDimitry Andric public: 1018*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1019*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1020*0fca6ea1SDimitry Andric auto DAG = SyncPipe[0].DAG; 1021*0fca6ea1SDimitry Andric 1022*0fca6ea1SDimitry Andric if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) 1023*0fca6ea1SDimitry Andric return false; 1024*0fca6ea1SDimitry Andric 1025*0fca6ea1SDimitry Andric if (Cache->empty()) { 1026*0fca6ea1SDimitry Andric auto TempSU = ChainSeed; 1027*0fca6ea1SDimitry Andric auto Depth = Number; 1028*0fca6ea1SDimitry Andric while (Depth > 0) { 1029*0fca6ea1SDimitry Andric --Depth; 1030*0fca6ea1SDimitry Andric bool Found = false; 1031*0fca6ea1SDimitry Andric for (auto &Succ : TempSU->Succs) { 1032*0fca6ea1SDimitry Andric if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { 1033*0fca6ea1SDimitry Andric TempSU = Succ.getSUnit(); 1034*0fca6ea1SDimitry Andric Found = true; 1035*0fca6ea1SDimitry Andric break; 1036*0fca6ea1SDimitry Andric } 1037*0fca6ea1SDimitry Andric } 1038*0fca6ea1SDimitry Andric if (!Found) 1039*0fca6ea1SDimitry Andric return false; 1040*0fca6ea1SDimitry Andric } 1041*0fca6ea1SDimitry Andric 1042*0fca6ea1SDimitry Andric Cache->push_back(TempSU); 1043*0fca6ea1SDimitry Andric } 1044*0fca6ea1SDimitry Andric // If we failed to find the instruction to be placed into the cache, we 1045*0fca6ea1SDimitry Andric // would have already exited. 1046*0fca6ea1SDimitry Andric assert(!Cache->empty()); 1047*0fca6ea1SDimitry Andric 1048*0fca6ea1SDimitry Andric return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); 1049*0fca6ea1SDimitry Andric } 1050*0fca6ea1SDimitry Andric 1051*0fca6ea1SDimitry Andric EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed, 1052*0fca6ea1SDimitry Andric const SIInstrInfo *TII, unsigned SGID, 1053*0fca6ea1SDimitry Andric bool NeedsCache = false) 1054*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Number(Number), 1055*0fca6ea1SDimitry Andric ChainSeed(ChainSeed) {} 1056*0fca6ea1SDimitry Andric }; 1057*0fca6ea1SDimitry Andric 1058*0fca6ea1SDimitry Andric /// Whether or not the instruction has less than \p Size immediate successors. 1059*0fca6ea1SDimitry Andric /// If \p HasIntermediary is true, this tests also whether all successors of 1060*0fca6ea1SDimitry Andric /// the SUnit have less than \p Size successors. 1061*0fca6ea1SDimitry Andric class LessThanNSuccs final : public InstructionRule { 1062*0fca6ea1SDimitry Andric private: 1063*0fca6ea1SDimitry Andric unsigned Size = 1; 1064*0fca6ea1SDimitry Andric bool HasIntermediary = false; 1065*0fca6ea1SDimitry Andric 1066*0fca6ea1SDimitry Andric public: 1067*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1068*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1069*0fca6ea1SDimitry Andric if (!SyncPipe.size()) 1070*0fca6ea1SDimitry Andric return false; 1071*0fca6ea1SDimitry Andric 1072*0fca6ea1SDimitry Andric auto SuccSize = std::count_if( 1073*0fca6ea1SDimitry Andric SU->Succs.begin(), SU->Succs.end(), 1074*0fca6ea1SDimitry Andric [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); 1075*0fca6ea1SDimitry Andric if (SuccSize >= Size) 1076*0fca6ea1SDimitry Andric return false; 1077*0fca6ea1SDimitry Andric 1078*0fca6ea1SDimitry Andric if (HasIntermediary) { 1079*0fca6ea1SDimitry Andric for (auto Succ : SU->Succs) { 1080*0fca6ea1SDimitry Andric auto SuccSize = std::count_if( 1081*0fca6ea1SDimitry Andric Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), 1082*0fca6ea1SDimitry Andric [](const SDep &SuccSucc) { 1083*0fca6ea1SDimitry Andric return SuccSucc.getKind() == SDep::Data; 1084*0fca6ea1SDimitry Andric }); 1085*0fca6ea1SDimitry Andric if (SuccSize >= Size) 1086*0fca6ea1SDimitry Andric return false; 1087*0fca6ea1SDimitry Andric } 1088*0fca6ea1SDimitry Andric } 1089*0fca6ea1SDimitry Andric 1090*0fca6ea1SDimitry Andric return true; 1091*0fca6ea1SDimitry Andric } 1092*0fca6ea1SDimitry Andric LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID, 1093*0fca6ea1SDimitry Andric bool HasIntermediary = false, bool NeedsCache = false) 1094*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Size(Size), 1095*0fca6ea1SDimitry Andric HasIntermediary(HasIntermediary) {} 1096*0fca6ea1SDimitry Andric }; 1097*0fca6ea1SDimitry Andric 1098*0fca6ea1SDimitry Andric /// Whether or not the instruction has greater than or equal to \p Size 1099*0fca6ea1SDimitry Andric /// immediate successors. If \p HasIntermediary is true, this tests also 1100*0fca6ea1SDimitry Andric /// whether all successors of the SUnit have greater than or equal to \p Size 1101*0fca6ea1SDimitry Andric /// successors. 1102*0fca6ea1SDimitry Andric class GreaterThanOrEqualToNSuccs final : public InstructionRule { 1103*0fca6ea1SDimitry Andric private: 1104*0fca6ea1SDimitry Andric unsigned Size = 1; 1105*0fca6ea1SDimitry Andric bool HasIntermediary = false; 1106*0fca6ea1SDimitry Andric 1107*0fca6ea1SDimitry Andric public: 1108*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1109*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1110*0fca6ea1SDimitry Andric if (!SyncPipe.size()) 1111*0fca6ea1SDimitry Andric return false; 1112*0fca6ea1SDimitry Andric 1113*0fca6ea1SDimitry Andric auto SuccSize = std::count_if( 1114*0fca6ea1SDimitry Andric SU->Succs.begin(), SU->Succs.end(), 1115*0fca6ea1SDimitry Andric [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); 1116*0fca6ea1SDimitry Andric if (SuccSize >= Size) 1117*0fca6ea1SDimitry Andric return true; 1118*0fca6ea1SDimitry Andric 1119*0fca6ea1SDimitry Andric if (HasIntermediary) { 1120*0fca6ea1SDimitry Andric for (auto Succ : SU->Succs) { 1121*0fca6ea1SDimitry Andric auto SuccSize = std::count_if( 1122*0fca6ea1SDimitry Andric Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), 1123*0fca6ea1SDimitry Andric [](const SDep &SuccSucc) { 1124*0fca6ea1SDimitry Andric return SuccSucc.getKind() == SDep::Data; 1125*0fca6ea1SDimitry Andric }); 1126*0fca6ea1SDimitry Andric if (SuccSize >= Size) 1127*0fca6ea1SDimitry Andric return true; 1128*0fca6ea1SDimitry Andric } 1129*0fca6ea1SDimitry Andric } 1130*0fca6ea1SDimitry Andric 1131*0fca6ea1SDimitry Andric return false; 1132*0fca6ea1SDimitry Andric } 1133*0fca6ea1SDimitry Andric GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII, 1134*0fca6ea1SDimitry Andric unsigned SGID, bool HasIntermediary = false, 1135*0fca6ea1SDimitry Andric bool NeedsCache = false) 1136*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Size(Size), 1137*0fca6ea1SDimitry Andric HasIntermediary(HasIntermediary) {} 1138*0fca6ea1SDimitry Andric }; 1139*0fca6ea1SDimitry Andric 1140*0fca6ea1SDimitry Andric // Whether or not the instruction is a relevant V_CVT instruction. 1141*0fca6ea1SDimitry Andric class IsCvt final : public InstructionRule { 1142*0fca6ea1SDimitry Andric public: 1143*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1144*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1145*0fca6ea1SDimitry Andric auto Opc = SU->getInstr()->getOpcode(); 1146*0fca6ea1SDimitry Andric return Opc == AMDGPU::V_CVT_F16_F32_e32 || 1147*0fca6ea1SDimitry Andric Opc == AMDGPU::V_CVT_I32_F32_e32; 1148*0fca6ea1SDimitry Andric } 1149*0fca6ea1SDimitry Andric IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1150*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 1151*0fca6ea1SDimitry Andric }; 1152*0fca6ea1SDimitry Andric 1153*0fca6ea1SDimitry Andric // Whether or not the instruction is FMA_F32. 1154*0fca6ea1SDimitry Andric class IsFMA final : public InstructionRule { 1155*0fca6ea1SDimitry Andric public: 1156*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1157*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1158*0fca6ea1SDimitry Andric return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 || 1159*0fca6ea1SDimitry Andric SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32; 1160*0fca6ea1SDimitry Andric } 1161*0fca6ea1SDimitry Andric IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1162*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 1163*0fca6ea1SDimitry Andric }; 1164*0fca6ea1SDimitry Andric 1165*0fca6ea1SDimitry Andric // Whether or not the instruction is a V_ADD_F32 instruction. 1166*0fca6ea1SDimitry Andric class IsPipeAdd final : public InstructionRule { 1167*0fca6ea1SDimitry Andric public: 1168*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1169*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1170*0fca6ea1SDimitry Andric return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32; 1171*0fca6ea1SDimitry Andric } 1172*0fca6ea1SDimitry Andric IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1173*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 1174*0fca6ea1SDimitry Andric }; 1175*0fca6ea1SDimitry Andric 1176*0fca6ea1SDimitry Andric /// Whether or not the instruction is an immediate RAW successor 1177*0fca6ea1SDimitry Andric /// of the SchedGroup \p Distance steps before. 1178*0fca6ea1SDimitry Andric class IsSuccOfPrevNthGroup final : public InstructionRule { 1179*0fca6ea1SDimitry Andric private: 1180*0fca6ea1SDimitry Andric unsigned Distance = 1; 1181*0fca6ea1SDimitry Andric 1182*0fca6ea1SDimitry Andric public: 1183*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1184*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1185*0fca6ea1SDimitry Andric SchedGroup *OtherGroup = nullptr; 1186*0fca6ea1SDimitry Andric if (!SyncPipe.size()) 1187*0fca6ea1SDimitry Andric return false; 1188*0fca6ea1SDimitry Andric 1189*0fca6ea1SDimitry Andric for (auto &PipeSG : SyncPipe) { 1190*0fca6ea1SDimitry Andric if ((unsigned)PipeSG.getSGID() == SGID - Distance) 1191*0fca6ea1SDimitry Andric OtherGroup = &PipeSG; 1192*0fca6ea1SDimitry Andric } 1193*0fca6ea1SDimitry Andric 1194*0fca6ea1SDimitry Andric if (!OtherGroup) 1195*0fca6ea1SDimitry Andric return false; 1196*0fca6ea1SDimitry Andric if (!OtherGroup->Collection.size()) 1197*0fca6ea1SDimitry Andric return true; 1198*0fca6ea1SDimitry Andric 1199*0fca6ea1SDimitry Andric for (auto &OtherEle : OtherGroup->Collection) { 1200*0fca6ea1SDimitry Andric for (auto &Succ : OtherEle->Succs) { 1201*0fca6ea1SDimitry Andric if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data) 1202*0fca6ea1SDimitry Andric return true; 1203*0fca6ea1SDimitry Andric } 1204*0fca6ea1SDimitry Andric } 1205*0fca6ea1SDimitry Andric 1206*0fca6ea1SDimitry Andric return false; 1207*0fca6ea1SDimitry Andric } 1208*0fca6ea1SDimitry Andric IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, 1209*0fca6ea1SDimitry Andric unsigned SGID, bool NeedsCache = false) 1210*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} 1211*0fca6ea1SDimitry Andric }; 1212*0fca6ea1SDimitry Andric 1213*0fca6ea1SDimitry Andric /// Whether or not the instruction is a transitive successor of any 1214*0fca6ea1SDimitry Andric /// instruction the the SchedGroup \p Distance steps before. 1215*0fca6ea1SDimitry Andric class IsReachableFromPrevNthGroup final : public InstructionRule { 1216*0fca6ea1SDimitry Andric private: 1217*0fca6ea1SDimitry Andric unsigned Distance = 1; 1218*0fca6ea1SDimitry Andric 1219*0fca6ea1SDimitry Andric public: 1220*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1221*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1222*0fca6ea1SDimitry Andric SchedGroup *OtherGroup = nullptr; 1223*0fca6ea1SDimitry Andric if (!SyncPipe.size()) 1224*0fca6ea1SDimitry Andric return false; 1225*0fca6ea1SDimitry Andric 1226*0fca6ea1SDimitry Andric for (auto &PipeSG : SyncPipe) { 1227*0fca6ea1SDimitry Andric if ((unsigned)PipeSG.getSGID() == SGID - Distance) 1228*0fca6ea1SDimitry Andric OtherGroup = &PipeSG; 1229*0fca6ea1SDimitry Andric } 1230*0fca6ea1SDimitry Andric 1231*0fca6ea1SDimitry Andric if (!OtherGroup) 1232*0fca6ea1SDimitry Andric return false; 1233*0fca6ea1SDimitry Andric if (!OtherGroup->Collection.size()) 1234*0fca6ea1SDimitry Andric return true; 1235*0fca6ea1SDimitry Andric 1236*0fca6ea1SDimitry Andric auto DAG = SyncPipe[0].DAG; 1237*0fca6ea1SDimitry Andric 1238*0fca6ea1SDimitry Andric for (auto &OtherEle : OtherGroup->Collection) 1239*0fca6ea1SDimitry Andric if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle)) 1240*0fca6ea1SDimitry Andric return true; 1241*0fca6ea1SDimitry Andric 1242*0fca6ea1SDimitry Andric return false; 1243*0fca6ea1SDimitry Andric } 1244*0fca6ea1SDimitry Andric IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, 1245*0fca6ea1SDimitry Andric unsigned SGID, bool NeedsCache = false) 1246*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} 1247*0fca6ea1SDimitry Andric }; 1248*0fca6ea1SDimitry Andric 1249*0fca6ea1SDimitry Andric /// Whether or not the instruction occurs after the SU with NodeNUm \p Number 1250*0fca6ea1SDimitry Andric class OccursAtOrAfterNode final : public InstructionRule { 1251*0fca6ea1SDimitry Andric private: 1252*0fca6ea1SDimitry Andric unsigned Number = 1; 1253*0fca6ea1SDimitry Andric 1254*0fca6ea1SDimitry Andric public: 1255*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1256*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1257*0fca6ea1SDimitry Andric 1258*0fca6ea1SDimitry Andric return SU->NodeNum >= Number; 1259*0fca6ea1SDimitry Andric } 1260*0fca6ea1SDimitry Andric OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID, 1261*0fca6ea1SDimitry Andric bool NeedsCache = false) 1262*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} 1263*0fca6ea1SDimitry Andric }; 1264*0fca6ea1SDimitry Andric 1265*0fca6ea1SDimitry Andric /// Whether or not the SU is exactly the \p Number th MFMA in the chain 1266*0fca6ea1SDimitry Andric /// starting with \p ChainSeed 1267*0fca6ea1SDimitry Andric class IsExactMFMA final : public InstructionRule { 1268*0fca6ea1SDimitry Andric private: 1269*0fca6ea1SDimitry Andric unsigned Number = 1; 1270*0fca6ea1SDimitry Andric SUnit *ChainSeed; 1271*0fca6ea1SDimitry Andric 1272*0fca6ea1SDimitry Andric public: 1273*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1274*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1275*0fca6ea1SDimitry Andric if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) 1276*0fca6ea1SDimitry Andric return false; 1277*0fca6ea1SDimitry Andric 1278*0fca6ea1SDimitry Andric if (Cache->empty()) { 1279*0fca6ea1SDimitry Andric auto TempSU = ChainSeed; 1280*0fca6ea1SDimitry Andric auto Depth = Number; 1281*0fca6ea1SDimitry Andric while (Depth > 0) { 1282*0fca6ea1SDimitry Andric --Depth; 1283*0fca6ea1SDimitry Andric bool Found = false; 1284*0fca6ea1SDimitry Andric for (auto &Succ : TempSU->Succs) { 1285*0fca6ea1SDimitry Andric if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { 1286*0fca6ea1SDimitry Andric TempSU = Succ.getSUnit(); 1287*0fca6ea1SDimitry Andric Found = true; 1288*0fca6ea1SDimitry Andric break; 1289*0fca6ea1SDimitry Andric } 1290*0fca6ea1SDimitry Andric } 1291*0fca6ea1SDimitry Andric if (!Found) { 1292*0fca6ea1SDimitry Andric return false; 1293*0fca6ea1SDimitry Andric } 1294*0fca6ea1SDimitry Andric } 1295*0fca6ea1SDimitry Andric Cache->push_back(TempSU); 1296*0fca6ea1SDimitry Andric } 1297*0fca6ea1SDimitry Andric // If we failed to find the instruction to be placed into the cache, we 1298*0fca6ea1SDimitry Andric // would have already exited. 1299*0fca6ea1SDimitry Andric assert(!Cache->empty()); 1300*0fca6ea1SDimitry Andric 1301*0fca6ea1SDimitry Andric return (*Cache)[0] == SU; 1302*0fca6ea1SDimitry Andric } 1303*0fca6ea1SDimitry Andric 1304*0fca6ea1SDimitry Andric IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII, 1305*0fca6ea1SDimitry Andric unsigned SGID, bool NeedsCache = false) 1306*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Number(Number), 1307*0fca6ea1SDimitry Andric ChainSeed(ChainSeed) {} 1308*0fca6ea1SDimitry Andric }; 1309*0fca6ea1SDimitry Andric 1310*0fca6ea1SDimitry Andric // Whether the instruction occurs after the first TRANS instruction. This 1311*0fca6ea1SDimitry Andric // implies the instruction can not be a predecessor of the first TRANS 1312*0fca6ea1SDimitry Andric // insruction 1313*0fca6ea1SDimitry Andric class OccursAfterExp final : public InstructionRule { 1314*0fca6ea1SDimitry Andric public: 1315*0fca6ea1SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1316*0fca6ea1SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 1317*0fca6ea1SDimitry Andric 1318*0fca6ea1SDimitry Andric SmallVector<SUnit *, 12> Worklist; 1319*0fca6ea1SDimitry Andric auto DAG = SyncPipe[0].DAG; 1320*0fca6ea1SDimitry Andric if (Cache->empty()) { 1321*0fca6ea1SDimitry Andric for (auto &SU : DAG->SUnits) 1322*0fca6ea1SDimitry Andric if (TII->isTRANS(SU.getInstr()->getOpcode())) { 1323*0fca6ea1SDimitry Andric Cache->push_back(&SU); 1324*0fca6ea1SDimitry Andric break; 1325*0fca6ea1SDimitry Andric } 1326*0fca6ea1SDimitry Andric if (Cache->empty()) 1327*0fca6ea1SDimitry Andric return false; 1328*0fca6ea1SDimitry Andric } 1329*0fca6ea1SDimitry Andric 1330*0fca6ea1SDimitry Andric return SU->NodeNum > (*Cache)[0]->NodeNum; 1331*0fca6ea1SDimitry Andric } 1332*0fca6ea1SDimitry Andric 1333*0fca6ea1SDimitry Andric OccursAfterExp(const SIInstrInfo *TII, unsigned SGID, 1334*0fca6ea1SDimitry Andric bool NeedsCache = false) 1335*0fca6ea1SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 1336*0fca6ea1SDimitry Andric }; 1337*0fca6ea1SDimitry Andric 1338*0fca6ea1SDimitry Andric public: 1339*0fca6ea1SDimitry Andric bool applyIGLPStrategy( 1340*0fca6ea1SDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 1341*0fca6ea1SDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 1342*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) override; 1343*0fca6ea1SDimitry Andric 1344*0fca6ea1SDimitry Andric bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 1345*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) override; 1346*0fca6ea1SDimitry Andric 1347*0fca6ea1SDimitry Andric MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 1348*0fca6ea1SDimitry Andric : IGLPStrategy(DAG, TII) { 1349*0fca6ea1SDimitry Andric IsBottomUp = false; 1350*0fca6ea1SDimitry Andric } 1351*0fca6ea1SDimitry Andric }; 1352*0fca6ea1SDimitry Andric 1353*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::TransPipeCount = 0; 1354*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0; 1355*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::AddPipeCount = 0; 1356*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0; 1357*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::ExpRequirement = 0; 1358*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::MFMAChains = 0; 1359*0fca6ea1SDimitry Andric unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0; 1360*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::HasCvt = false; 1361*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false; 1362*0fca6ea1SDimitry Andric std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt; 1363*0fca6ea1SDimitry Andric 1364*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { 1365*0fca6ea1SDimitry Andric SmallVector<SUnit *, 10> ExpPipeCands; 1366*0fca6ea1SDimitry Andric SmallVector<SUnit *, 10> MFMAPipeCands; 1367*0fca6ea1SDimitry Andric SmallVector<SUnit *, 10> MFMAPipeSUs; 1368*0fca6ea1SDimitry Andric SmallVector<SUnit *, 10> PackSUs; 1369*0fca6ea1SDimitry Andric SmallVector<SUnit *, 10> CvtSUs; 1370*0fca6ea1SDimitry Andric 1371*0fca6ea1SDimitry Andric auto isBitPack = [](unsigned Opc) { 1372*0fca6ea1SDimitry Andric return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64; 1373*0fca6ea1SDimitry Andric }; 1374*0fca6ea1SDimitry Andric 1375*0fca6ea1SDimitry Andric auto isCvt = [](unsigned Opc) { 1376*0fca6ea1SDimitry Andric return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32; 1377*0fca6ea1SDimitry Andric }; 1378*0fca6ea1SDimitry Andric 1379*0fca6ea1SDimitry Andric auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; }; 1380*0fca6ea1SDimitry Andric 1381*0fca6ea1SDimitry Andric AddPipeCount = 0; 1382*0fca6ea1SDimitry Andric for (SUnit &SU : DAG->SUnits) { 1383*0fca6ea1SDimitry Andric auto Opc = SU.getInstr()->getOpcode(); 1384*0fca6ea1SDimitry Andric if (TII->isTRANS(Opc)) { 1385*0fca6ea1SDimitry Andric // Avoid counting a potential bonus V_EXP which all the MFMA depend on 1386*0fca6ea1SDimitry Andric if (SU.Succs.size() >= 7) 1387*0fca6ea1SDimitry Andric continue; 1388*0fca6ea1SDimitry Andric for (auto &Succ : SU.Succs) { 1389*0fca6ea1SDimitry Andric if (Succ.getSUnit()->Succs.size() >= 7) 1390*0fca6ea1SDimitry Andric continue; 1391*0fca6ea1SDimitry Andric } 1392*0fca6ea1SDimitry Andric ExpPipeCands.push_back(&SU); 1393*0fca6ea1SDimitry Andric } 1394*0fca6ea1SDimitry Andric 1395*0fca6ea1SDimitry Andric if (TII->isMFMAorWMMA(*SU.getInstr())) 1396*0fca6ea1SDimitry Andric MFMAPipeCands.push_back(&SU); 1397*0fca6ea1SDimitry Andric 1398*0fca6ea1SDimitry Andric if (isBitPack(Opc)) 1399*0fca6ea1SDimitry Andric PackSUs.push_back(&SU); 1400*0fca6ea1SDimitry Andric 1401*0fca6ea1SDimitry Andric if (isCvt(Opc)) 1402*0fca6ea1SDimitry Andric CvtSUs.push_back(&SU); 1403*0fca6ea1SDimitry Andric 1404*0fca6ea1SDimitry Andric if (isAdd(Opc)) 1405*0fca6ea1SDimitry Andric ++AddPipeCount; 1406*0fca6ea1SDimitry Andric } 1407*0fca6ea1SDimitry Andric 1408*0fca6ea1SDimitry Andric if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size())) 1409*0fca6ea1SDimitry Andric return false; 1410*0fca6ea1SDimitry Andric 1411*0fca6ea1SDimitry Andric TransPipeCount = 0; 1412*0fca6ea1SDimitry Andric 1413*0fca6ea1SDimitry Andric std::optional<SUnit *> TempMFMA; 1414*0fca6ea1SDimitry Andric std::optional<SUnit *> TempExp; 1415*0fca6ea1SDimitry Andric // Count the number of EXPs that reach an MFMA 1416*0fca6ea1SDimitry Andric for (auto &PredSU : ExpPipeCands) { 1417*0fca6ea1SDimitry Andric for (auto &SuccSU : MFMAPipeCands) { 1418*0fca6ea1SDimitry Andric if (DAG->IsReachable(SuccSU, PredSU)) { 1419*0fca6ea1SDimitry Andric if (!TempExp) { 1420*0fca6ea1SDimitry Andric TempExp = PredSU; 1421*0fca6ea1SDimitry Andric TempMFMA = SuccSU; 1422*0fca6ea1SDimitry Andric } 1423*0fca6ea1SDimitry Andric MFMAPipeSUs.push_back(SuccSU); 1424*0fca6ea1SDimitry Andric ++TransPipeCount; 1425*0fca6ea1SDimitry Andric break; 1426*0fca6ea1SDimitry Andric } 1427*0fca6ea1SDimitry Andric } 1428*0fca6ea1SDimitry Andric } 1429*0fca6ea1SDimitry Andric 1430*0fca6ea1SDimitry Andric if (!(TempExp && TempMFMA)) 1431*0fca6ea1SDimitry Andric return false; 1432*0fca6ea1SDimitry Andric 1433*0fca6ea1SDimitry Andric HasChainBetweenCvt = 1434*0fca6ea1SDimitry Andric std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(), 1435*0fca6ea1SDimitry Andric [&isCvt](SDep &Succ) { 1436*0fca6ea1SDimitry Andric return isCvt(Succ.getSUnit()->getInstr()->getOpcode()); 1437*0fca6ea1SDimitry Andric }) == (*TempExp)->Succs.end(); 1438*0fca6ea1SDimitry Andric 1439*0fca6ea1SDimitry Andric // Count the number of MFMAs that are reached by an EXP 1440*0fca6ea1SDimitry Andric for (auto &SuccSU : MFMAPipeCands) { 1441*0fca6ea1SDimitry Andric if (MFMAPipeSUs.size() && 1442*0fca6ea1SDimitry Andric std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(), 1443*0fca6ea1SDimitry Andric [&SuccSU](SUnit *PotentialMatch) { 1444*0fca6ea1SDimitry Andric return PotentialMatch->NodeNum == SuccSU->NodeNum; 1445*0fca6ea1SDimitry Andric }) != MFMAPipeSUs.end()) 1446*0fca6ea1SDimitry Andric continue; 1447*0fca6ea1SDimitry Andric 1448*0fca6ea1SDimitry Andric for (auto &PredSU : ExpPipeCands) { 1449*0fca6ea1SDimitry Andric if (DAG->IsReachable(SuccSU, PredSU)) { 1450*0fca6ea1SDimitry Andric MFMAPipeSUs.push_back(SuccSU); 1451*0fca6ea1SDimitry Andric break; 1452*0fca6ea1SDimitry Andric } 1453*0fca6ea1SDimitry Andric } 1454*0fca6ea1SDimitry Andric } 1455*0fca6ea1SDimitry Andric 1456*0fca6ea1SDimitry Andric MFMAPipeCount = MFMAPipeSUs.size(); 1457*0fca6ea1SDimitry Andric 1458*0fca6ea1SDimitry Andric assert(TempExp && TempMFMA); 1459*0fca6ea1SDimitry Andric assert(MFMAPipeCount > 0); 1460*0fca6ea1SDimitry Andric 1461*0fca6ea1SDimitry Andric std::optional<SUnit *> TempCvt; 1462*0fca6ea1SDimitry Andric for (auto &SuccSU : CvtSUs) { 1463*0fca6ea1SDimitry Andric if (DAG->IsReachable(SuccSU, *TempExp)) { 1464*0fca6ea1SDimitry Andric TempCvt = SuccSU; 1465*0fca6ea1SDimitry Andric break; 1466*0fca6ea1SDimitry Andric } 1467*0fca6ea1SDimitry Andric } 1468*0fca6ea1SDimitry Andric 1469*0fca6ea1SDimitry Andric HasCvt = false; 1470*0fca6ea1SDimitry Andric if (TempCvt.has_value()) { 1471*0fca6ea1SDimitry Andric for (auto &SuccSU : MFMAPipeSUs) { 1472*0fca6ea1SDimitry Andric if (DAG->IsReachable(SuccSU, *TempCvt)) { 1473*0fca6ea1SDimitry Andric HasCvt = true; 1474*0fca6ea1SDimitry Andric break; 1475*0fca6ea1SDimitry Andric } 1476*0fca6ea1SDimitry Andric } 1477*0fca6ea1SDimitry Andric } 1478*0fca6ea1SDimitry Andric 1479*0fca6ea1SDimitry Andric MFMAChains = 0; 1480*0fca6ea1SDimitry Andric for (auto &MFMAPipeSU : MFMAPipeSUs) { 1481*0fca6ea1SDimitry Andric if (is_contained(MFMAChainSeeds, MFMAPipeSU)) 1482*0fca6ea1SDimitry Andric continue; 1483*0fca6ea1SDimitry Andric if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(), 1484*0fca6ea1SDimitry Andric [&TII](SDep &Succ) { 1485*0fca6ea1SDimitry Andric return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); 1486*0fca6ea1SDimitry Andric })) { 1487*0fca6ea1SDimitry Andric MFMAChainSeeds.push_back(MFMAPipeSU); 1488*0fca6ea1SDimitry Andric ++MFMAChains; 1489*0fca6ea1SDimitry Andric } 1490*0fca6ea1SDimitry Andric } 1491*0fca6ea1SDimitry Andric 1492*0fca6ea1SDimitry Andric if (!MFMAChains) 1493*0fca6ea1SDimitry Andric return false; 1494*0fca6ea1SDimitry Andric 1495*0fca6ea1SDimitry Andric for (auto Pred : MFMAChainSeeds[0]->Preds) { 1496*0fca6ea1SDimitry Andric if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) && 1497*0fca6ea1SDimitry Andric Pred.getSUnit()->getInstr()->mayLoad()) 1498*0fca6ea1SDimitry Andric FirstPipeDSR = Pred.getSUnit()->NodeNum; 1499*0fca6ea1SDimitry Andric } 1500*0fca6ea1SDimitry Andric 1501*0fca6ea1SDimitry Andric MFMAChainLength = MFMAPipeCount / MFMAChains; 1502*0fca6ea1SDimitry Andric 1503*0fca6ea1SDimitry Andric // The number of bit pack operations that depend on a single V_EXP 1504*0fca6ea1SDimitry Andric unsigned PackSuccCount = std::count_if( 1505*0fca6ea1SDimitry Andric PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) { 1506*0fca6ea1SDimitry Andric return DAG->IsReachable(VPack, *TempExp); 1507*0fca6ea1SDimitry Andric }); 1508*0fca6ea1SDimitry Andric 1509*0fca6ea1SDimitry Andric // The number of bit pack operations an MFMA depends on 1510*0fca6ea1SDimitry Andric unsigned PackPredCount = 1511*0fca6ea1SDimitry Andric std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), 1512*0fca6ea1SDimitry Andric [&isBitPack](SDep &Pred) { 1513*0fca6ea1SDimitry Andric auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); 1514*0fca6ea1SDimitry Andric return isBitPack(Opc); 1515*0fca6ea1SDimitry Andric }); 1516*0fca6ea1SDimitry Andric 1517*0fca6ea1SDimitry Andric auto PackPred = 1518*0fca6ea1SDimitry Andric std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), 1519*0fca6ea1SDimitry Andric [&isBitPack](SDep &Pred) { 1520*0fca6ea1SDimitry Andric auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); 1521*0fca6ea1SDimitry Andric return isBitPack(Opc); 1522*0fca6ea1SDimitry Andric }); 1523*0fca6ea1SDimitry Andric 1524*0fca6ea1SDimitry Andric if (PackPred == (*TempMFMA)->Preds.end()) 1525*0fca6ea1SDimitry Andric return false; 1526*0fca6ea1SDimitry Andric 1527*0fca6ea1SDimitry Andric MFMAEnablement = 0; 1528*0fca6ea1SDimitry Andric ExpRequirement = 0; 1529*0fca6ea1SDimitry Andric // How many MFMAs depend on a single bit pack operation 1530*0fca6ea1SDimitry Andric MFMAEnablement = 1531*0fca6ea1SDimitry Andric std::count_if(PackPred->getSUnit()->Succs.begin(), 1532*0fca6ea1SDimitry Andric PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) { 1533*0fca6ea1SDimitry Andric return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); 1534*0fca6ea1SDimitry Andric }); 1535*0fca6ea1SDimitry Andric 1536*0fca6ea1SDimitry Andric // The number of MFMAs that depend on a single V_EXP 1537*0fca6ea1SDimitry Andric MFMAEnablement *= PackSuccCount; 1538*0fca6ea1SDimitry Andric 1539*0fca6ea1SDimitry Andric // The number of V_EXPs required to resolve all dependencies for an MFMA 1540*0fca6ea1SDimitry Andric ExpRequirement = 1541*0fca6ea1SDimitry Andric std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(), 1542*0fca6ea1SDimitry Andric [this, &PackPred](SUnit *ExpBase) { 1543*0fca6ea1SDimitry Andric return DAG->IsReachable(PackPred->getSUnit(), ExpBase); 1544*0fca6ea1SDimitry Andric }); 1545*0fca6ea1SDimitry Andric 1546*0fca6ea1SDimitry Andric ExpRequirement *= PackPredCount; 1547*0fca6ea1SDimitry Andric return true; 1548*0fca6ea1SDimitry Andric } 1549*0fca6ea1SDimitry Andric 1550*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, 1551*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) { 1552*0fca6ea1SDimitry Andric const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); 1553*0fca6ea1SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 1554*0fca6ea1SDimitry Andric 1555*0fca6ea1SDimitry Andric if (Phase != AMDGPU::SchedulingPhase::PostRA) 1556*0fca6ea1SDimitry Andric MFMAChainSeeds.clear(); 1557*0fca6ea1SDimitry Andric if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII)) 1558*0fca6ea1SDimitry Andric return false; 1559*0fca6ea1SDimitry Andric 1560*0fca6ea1SDimitry Andric return true; 1561*0fca6ea1SDimitry Andric } 1562*0fca6ea1SDimitry Andric 1563*0fca6ea1SDimitry Andric bool MFMAExpInterleaveOpt::applyIGLPStrategy( 1564*0fca6ea1SDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 1565*0fca6ea1SDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 1566*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) { 1567*0fca6ea1SDimitry Andric 1568*0fca6ea1SDimitry Andric bool IsSmallKernelType = 1569*0fca6ea1SDimitry Andric MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; 1570*0fca6ea1SDimitry Andric bool IsLargeKernelType = 1571*0fca6ea1SDimitry Andric MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64; 1572*0fca6ea1SDimitry Andric 1573*0fca6ea1SDimitry Andric if (!(IsSmallKernelType || IsLargeKernelType)) 1574*0fca6ea1SDimitry Andric return false; 1575*0fca6ea1SDimitry Andric 1576*0fca6ea1SDimitry Andric const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); 1577*0fca6ea1SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 1578*0fca6ea1SDimitry Andric 1579*0fca6ea1SDimitry Andric unsigned PipelineSyncID = 0; 1580*0fca6ea1SDimitry Andric SchedGroup *SG = nullptr; 1581*0fca6ea1SDimitry Andric 1582*0fca6ea1SDimitry Andric unsigned MFMAChain = 0; 1583*0fca6ea1SDimitry Andric unsigned PositionInChain = 0; 1584*0fca6ea1SDimitry Andric unsigned CurrMFMAForTransPosition = 0; 1585*0fca6ea1SDimitry Andric 1586*0fca6ea1SDimitry Andric auto incrementTransPosition = [&MFMAChain, &PositionInChain, 1587*0fca6ea1SDimitry Andric &CurrMFMAForTransPosition]() { 1588*0fca6ea1SDimitry Andric CurrMFMAForTransPosition += MFMAEnablement; 1589*0fca6ea1SDimitry Andric PositionInChain = (CurrMFMAForTransPosition / MFMAChains); 1590*0fca6ea1SDimitry Andric MFMAChain = CurrMFMAForTransPosition % MFMAChains; 1591*0fca6ea1SDimitry Andric }; 1592*0fca6ea1SDimitry Andric 1593*0fca6ea1SDimitry Andric auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() { 1594*0fca6ea1SDimitry Andric auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; 1595*0fca6ea1SDimitry Andric return (TempMFMAForTrans / MFMAChains); 1596*0fca6ea1SDimitry Andric }; 1597*0fca6ea1SDimitry Andric 1598*0fca6ea1SDimitry Andric auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() { 1599*0fca6ea1SDimitry Andric auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; 1600*0fca6ea1SDimitry Andric return TempMFMAForTrans % MFMAChains; 1601*0fca6ea1SDimitry Andric }; 1602*0fca6ea1SDimitry Andric 1603*0fca6ea1SDimitry Andric unsigned CurrMFMAPosition = 0; 1604*0fca6ea1SDimitry Andric unsigned MFMAChainForMFMA = 0; 1605*0fca6ea1SDimitry Andric unsigned PositionInChainForMFMA = 0; 1606*0fca6ea1SDimitry Andric 1607*0fca6ea1SDimitry Andric auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA, 1608*0fca6ea1SDimitry Andric &PositionInChainForMFMA]() { 1609*0fca6ea1SDimitry Andric ++CurrMFMAPosition; 1610*0fca6ea1SDimitry Andric MFMAChainForMFMA = CurrMFMAPosition % MFMAChains; 1611*0fca6ea1SDimitry Andric PositionInChainForMFMA = CurrMFMAPosition / MFMAChains; 1612*0fca6ea1SDimitry Andric }; 1613*0fca6ea1SDimitry Andric 1614*0fca6ea1SDimitry Andric bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA; 1615*0fca6ea1SDimitry Andric assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains); 1616*0fca6ea1SDimitry Andric 1617*0fca6ea1SDimitry Andric bool UsesFMA = IsSmallKernelType || !IsPostRA; 1618*0fca6ea1SDimitry Andric bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR; 1619*0fca6ea1SDimitry Andric bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA); 1620*0fca6ea1SDimitry Andric bool UsesVALU = IsSmallKernelType; 1621*0fca6ea1SDimitry Andric 1622*0fca6ea1SDimitry Andric // PHASE 1: "Prefetch" 1623*0fca6ea1SDimitry Andric if (UsesFMA) { 1624*0fca6ea1SDimitry Andric // First Round FMA 1625*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1626*0fca6ea1SDimitry Andric SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); 1627*0fca6ea1SDimitry Andric if (!IsPostRA && MFMAChains) { 1628*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1629*0fca6ea1SDimitry Andric PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), 1630*0fca6ea1SDimitry Andric true)); 1631*0fca6ea1SDimitry Andric } else 1632*0fca6ea1SDimitry Andric SG->addRule( 1633*0fca6ea1SDimitry Andric std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); 1634*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1635*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1636*0fca6ea1SDimitry Andric 1637*0fca6ea1SDimitry Andric // Second Round FMA 1638*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1639*0fca6ea1SDimitry Andric SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); 1640*0fca6ea1SDimitry Andric if (!IsPostRA && MFMAChains) { 1641*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1642*0fca6ea1SDimitry Andric getNextTransPositionInChain(), 1643*0fca6ea1SDimitry Andric MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); 1644*0fca6ea1SDimitry Andric } else 1645*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, 1646*0fca6ea1SDimitry Andric SG->getSGID(), true)); 1647*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1648*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1649*0fca6ea1SDimitry Andric } 1650*0fca6ea1SDimitry Andric 1651*0fca6ea1SDimitry Andric if (UsesDSRead) { 1652*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1653*0fca6ea1SDimitry Andric SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); 1654*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII, 1655*0fca6ea1SDimitry Andric SG->getSGID())); 1656*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1657*0fca6ea1SDimitry Andric } 1658*0fca6ea1SDimitry Andric 1659*0fca6ea1SDimitry Andric // First Round EXP 1660*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1661*0fca6ea1SDimitry Andric SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII); 1662*0fca6ea1SDimitry Andric if (!IsPostRA && MFMAChains) 1663*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1664*0fca6ea1SDimitry Andric PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true)); 1665*0fca6ea1SDimitry Andric else 1666*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); 1667*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1668*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), 1669*0fca6ea1SDimitry Andric HasChainBetweenCvt)); 1670*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1671*0fca6ea1SDimitry Andric 1672*0fca6ea1SDimitry Andric incrementTransPosition(); 1673*0fca6ea1SDimitry Andric 1674*0fca6ea1SDimitry Andric // First Round CVT, Third Round FMA, Second Round EXP; interleaved 1675*0fca6ea1SDimitry Andric for (unsigned I = 0; I < ExpRequirement; I++) { 1676*0fca6ea1SDimitry Andric // First Round CVT 1677*0fca6ea1SDimitry Andric if (UsesCvt) { 1678*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1679*0fca6ea1SDimitry Andric SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1680*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID())); 1681*0fca6ea1SDimitry Andric if (HasChainBetweenCvt) 1682*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( 1683*0fca6ea1SDimitry Andric 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); 1684*0fca6ea1SDimitry Andric else 1685*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>( 1686*0fca6ea1SDimitry Andric 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); 1687*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1688*0fca6ea1SDimitry Andric } 1689*0fca6ea1SDimitry Andric 1690*0fca6ea1SDimitry Andric // Third Round FMA 1691*0fca6ea1SDimitry Andric if (UsesFMA) { 1692*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1693*0fca6ea1SDimitry Andric SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1694*0fca6ea1SDimitry Andric if (!IsPostRA && MFMAChains) { 1695*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1696*0fca6ea1SDimitry Andric getNextTransPositionInChain(), 1697*0fca6ea1SDimitry Andric MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); 1698*0fca6ea1SDimitry Andric } else 1699*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1, 1700*0fca6ea1SDimitry Andric TII, SG->getSGID(), true)); 1701*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1702*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1703*0fca6ea1SDimitry Andric } 1704*0fca6ea1SDimitry Andric 1705*0fca6ea1SDimitry Andric // Second Round EXP 1706*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1707*0fca6ea1SDimitry Andric SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); 1708*0fca6ea1SDimitry Andric if (!IsPostRA && MFMAChains) 1709*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1710*0fca6ea1SDimitry Andric PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), 1711*0fca6ea1SDimitry Andric true)); 1712*0fca6ea1SDimitry Andric else 1713*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, 1714*0fca6ea1SDimitry Andric SG->getSGID(), true)); 1715*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1716*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), 1717*0fca6ea1SDimitry Andric HasChainBetweenCvt)); 1718*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1719*0fca6ea1SDimitry Andric } 1720*0fca6ea1SDimitry Andric 1721*0fca6ea1SDimitry Andric // The "extra" EXP which enables all MFMA 1722*0fca6ea1SDimitry Andric // TODO: UsesExtraExp 1723*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1724*0fca6ea1SDimitry Andric SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); 1725*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1726*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>( 1727*0fca6ea1SDimitry Andric 8, TII, SG->getSGID(), HasChainBetweenCvt)); 1728*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1729*0fca6ea1SDimitry Andric 1730*0fca6ea1SDimitry Andric // PHASE 2: Main Interleave Loop 1731*0fca6ea1SDimitry Andric 1732*0fca6ea1SDimitry Andric // The number of MFMAs per iteration 1733*0fca6ea1SDimitry Andric unsigned MFMARatio = 1734*0fca6ea1SDimitry Andric MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1; 1735*0fca6ea1SDimitry Andric // The number of Exps per iteration 1736*0fca6ea1SDimitry Andric unsigned ExpRatio = 1737*0fca6ea1SDimitry Andric MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement; 1738*0fca6ea1SDimitry Andric // The reamaining Exps 1739*0fca6ea1SDimitry Andric unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement) 1740*0fca6ea1SDimitry Andric ? TransPipeCount - (2 * ExpRequirement) 1741*0fca6ea1SDimitry Andric : 0; 1742*0fca6ea1SDimitry Andric unsigned ExpLoopCount = RemainingExp / ExpRatio; 1743*0fca6ea1SDimitry Andric // In loop MFMAs 1744*0fca6ea1SDimitry Andric unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2) 1745*0fca6ea1SDimitry Andric ? MFMAPipeCount - (MFMAEnablement * 2) 1746*0fca6ea1SDimitry Andric : 0; 1747*0fca6ea1SDimitry Andric unsigned MFMALoopCount = MFMAInLoop / MFMARatio; 1748*0fca6ea1SDimitry Andric unsigned VALUOps = 1749*0fca6ea1SDimitry Andric AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount; 1750*0fca6ea1SDimitry Andric unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount); 1751*0fca6ea1SDimitry Andric 1752*0fca6ea1SDimitry Andric for (unsigned I = 0; I < LoopSize; I++) { 1753*0fca6ea1SDimitry Andric if (!(I * ExpRatio % ExpRequirement)) 1754*0fca6ea1SDimitry Andric incrementTransPosition(); 1755*0fca6ea1SDimitry Andric 1756*0fca6ea1SDimitry Andric // Round N MFMA 1757*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1758*0fca6ea1SDimitry Andric SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII); 1759*0fca6ea1SDimitry Andric if (!IsPostRA && MFMAChains) 1760*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsExactMFMA>( 1761*0fca6ea1SDimitry Andric PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII, 1762*0fca6ea1SDimitry Andric SG->getSGID(), true)); 1763*0fca6ea1SDimitry Andric else 1764*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true)); 1765*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1766*0fca6ea1SDimitry Andric incrementMFMAPosition(); 1767*0fca6ea1SDimitry Andric 1768*0fca6ea1SDimitry Andric if (UsesVALU) { 1769*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1770*0fca6ea1SDimitry Andric SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII); 1771*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID())); 1772*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1773*0fca6ea1SDimitry Andric } 1774*0fca6ea1SDimitry Andric 1775*0fca6ea1SDimitry Andric if (UsesDSRead && !(I % 4)) { 1776*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1777*0fca6ea1SDimitry Andric SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); 1778*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII, 1779*0fca6ea1SDimitry Andric SG->getSGID())); 1780*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1781*0fca6ea1SDimitry Andric } 1782*0fca6ea1SDimitry Andric 1783*0fca6ea1SDimitry Andric // CVT, EXP, FMA Interleaving 1784*0fca6ea1SDimitry Andric for (unsigned J = 0; J < ExpRatio; J++) { 1785*0fca6ea1SDimitry Andric auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1); 1786*0fca6ea1SDimitry Andric auto MaxMFMAOffset = 1787*0fca6ea1SDimitry Andric (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio; 1788*0fca6ea1SDimitry Andric 1789*0fca6ea1SDimitry Andric // Round N + 1 CVT 1790*0fca6ea1SDimitry Andric if (UsesCvt) { 1791*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1792*0fca6ea1SDimitry Andric SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1793*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID())); 1794*0fca6ea1SDimitry Andric auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1; 1795*0fca6ea1SDimitry Andric auto DSROffset = I / 4 + 1; 1796*0fca6ea1SDimitry Andric auto MaxDSROffset = MaxMFMAOffset / 4; 1797*0fca6ea1SDimitry Andric // TODO: UsesExtraExp 1798*0fca6ea1SDimitry Andric auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1; 1799*0fca6ea1SDimitry Andric auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) + 1800*0fca6ea1SDimitry Andric std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff + 1801*0fca6ea1SDimitry Andric ExpOffset; 1802*0fca6ea1SDimitry Andric if (HasChainBetweenCvt) 1803*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( 1804*0fca6ea1SDimitry Andric CurrentOffset, TII, SG->getSGID())); 1805*0fca6ea1SDimitry Andric else 1806*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII, 1807*0fca6ea1SDimitry Andric SG->getSGID())); 1808*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1809*0fca6ea1SDimitry Andric } 1810*0fca6ea1SDimitry Andric 1811*0fca6ea1SDimitry Andric // Round N + 3 FMA 1812*0fca6ea1SDimitry Andric if (UsesFMA) { 1813*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1814*0fca6ea1SDimitry Andric SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1815*0fca6ea1SDimitry Andric if (!IsPostRA && MFMAChains) 1816*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1817*0fca6ea1SDimitry Andric getNextTransPositionInChain(), 1818*0fca6ea1SDimitry Andric MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), 1819*0fca6ea1SDimitry Andric true)); 1820*0fca6ea1SDimitry Andric else 1821*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMA>( 1822*0fca6ea1SDimitry Andric (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1, 1823*0fca6ea1SDimitry Andric TII, SG->getSGID(), true)); 1824*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1825*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1826*0fca6ea1SDimitry Andric } 1827*0fca6ea1SDimitry Andric 1828*0fca6ea1SDimitry Andric // Round N + 2 Exp 1829*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1830*0fca6ea1SDimitry Andric SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); 1831*0fca6ea1SDimitry Andric if (!IsPostRA && MFMAChains) 1832*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1833*0fca6ea1SDimitry Andric PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), 1834*0fca6ea1SDimitry Andric true)); 1835*0fca6ea1SDimitry Andric else 1836*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<EnablesNthMFMA>( 1837*0fca6ea1SDimitry Andric (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1, 1838*0fca6ea1SDimitry Andric TII, SG->getSGID(), true)); 1839*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1840*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), 1841*0fca6ea1SDimitry Andric HasChainBetweenCvt)); 1842*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1843*0fca6ea1SDimitry Andric } 1844*0fca6ea1SDimitry Andric } 1845*0fca6ea1SDimitry Andric 1846*0fca6ea1SDimitry Andric // PHASE 3: Remaining MFMAs 1847*0fca6ea1SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1848*0fca6ea1SDimitry Andric SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII); 1849*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true)); 1850*0fca6ea1SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1851*0fca6ea1SDimitry Andric return true; 1852bdd1243dSDimitry Andric } 1853bdd1243dSDimitry Andric 185406c3fb27SDimitry Andric class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { 185506c3fb27SDimitry Andric private: 185606c3fb27SDimitry Andric // Whether the DS_READ is a predecessor of first four MFMA in region 185706c3fb27SDimitry Andric class EnablesInitialMFMA final : public InstructionRule { 185806c3fb27SDimitry Andric public: 185906c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 186006c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 186106c3fb27SDimitry Andric if (!SyncPipe.size()) 186206c3fb27SDimitry Andric return false; 186306c3fb27SDimitry Andric int MFMAsFound = 0; 186406c3fb27SDimitry Andric if (!Cache->size()) { 186506c3fb27SDimitry Andric for (auto &Elt : SyncPipe[0].DAG->SUnits) { 186606c3fb27SDimitry Andric if (TII->isMFMAorWMMA(*Elt.getInstr())) { 186706c3fb27SDimitry Andric ++MFMAsFound; 186806c3fb27SDimitry Andric if (MFMAsFound > 4) 186906c3fb27SDimitry Andric break; 187006c3fb27SDimitry Andric Cache->push_back(&Elt); 187106c3fb27SDimitry Andric } 187206c3fb27SDimitry Andric } 187306c3fb27SDimitry Andric } 187406c3fb27SDimitry Andric 187506c3fb27SDimitry Andric assert(Cache->size()); 187606c3fb27SDimitry Andric auto DAG = SyncPipe[0].DAG; 187706c3fb27SDimitry Andric for (auto &Elt : *Cache) { 187806c3fb27SDimitry Andric if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU))) 187906c3fb27SDimitry Andric return true; 188006c3fb27SDimitry Andric } 188106c3fb27SDimitry Andric return false; 188206c3fb27SDimitry Andric } 188306c3fb27SDimitry Andric 188406c3fb27SDimitry Andric EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, 188506c3fb27SDimitry Andric bool NeedsCache = false) 188606c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 188706c3fb27SDimitry Andric }; 188806c3fb27SDimitry Andric 188906c3fb27SDimitry Andric // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE 189006c3fb27SDimitry Andric class IsPermForDSW final : public InstructionRule { 189106c3fb27SDimitry Andric public: 189206c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 189306c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 189406c3fb27SDimitry Andric auto MI = SU->getInstr(); 189506c3fb27SDimitry Andric if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) 189606c3fb27SDimitry Andric return false; 189706c3fb27SDimitry Andric 189806c3fb27SDimitry Andric bool FitsInGroup = false; 189906c3fb27SDimitry Andric // Does the VALU have a DS_WRITE successor 190006c3fb27SDimitry Andric if (!Collection.size()) { 190106c3fb27SDimitry Andric for (auto &Succ : SU->Succs) { 190206c3fb27SDimitry Andric SUnit *SuccUnit = Succ.getSUnit(); 190306c3fb27SDimitry Andric if (TII->isDS(*SuccUnit->getInstr()) && 190406c3fb27SDimitry Andric SuccUnit->getInstr()->mayStore()) { 190506c3fb27SDimitry Andric Cache->push_back(SuccUnit); 190606c3fb27SDimitry Andric FitsInGroup = true; 190706c3fb27SDimitry Andric } 190806c3fb27SDimitry Andric } 190906c3fb27SDimitry Andric return FitsInGroup; 191006c3fb27SDimitry Andric } 191106c3fb27SDimitry Andric 191206c3fb27SDimitry Andric assert(Cache->size()); 191306c3fb27SDimitry Andric 191406c3fb27SDimitry Andric // Does the VALU have a DS_WRITE successor that is the same as other 191506c3fb27SDimitry Andric // VALU already in the group. The V_PERMs will all share 1 DS_W succ 19165f757f3fSDimitry Andric return llvm::any_of(*Cache, [&SU](SUnit *Elt) { 19175f757f3fSDimitry Andric return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) { 191806c3fb27SDimitry Andric return ThisSucc.getSUnit() == Elt; 191906c3fb27SDimitry Andric }); 192006c3fb27SDimitry Andric }); 192106c3fb27SDimitry Andric } 192206c3fb27SDimitry Andric 192306c3fb27SDimitry Andric IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 192406c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 192506c3fb27SDimitry Andric }; 192606c3fb27SDimitry Andric 192706c3fb27SDimitry Andric // Whether the SU is a successor of any element in previous SchedGroup 192806c3fb27SDimitry Andric class IsSuccOfPrevGroup final : public InstructionRule { 192906c3fb27SDimitry Andric public: 193006c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 193106c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 193206c3fb27SDimitry Andric SchedGroup *OtherGroup = nullptr; 193306c3fb27SDimitry Andric for (auto &PipeSG : SyncPipe) { 193406c3fb27SDimitry Andric if ((unsigned)PipeSG.getSGID() == SGID - 1) { 193506c3fb27SDimitry Andric OtherGroup = &PipeSG; 193606c3fb27SDimitry Andric } 193706c3fb27SDimitry Andric } 193806c3fb27SDimitry Andric 193906c3fb27SDimitry Andric if (!OtherGroup) 194006c3fb27SDimitry Andric return false; 194106c3fb27SDimitry Andric if (!OtherGroup->Collection.size()) 194206c3fb27SDimitry Andric return true; 194306c3fb27SDimitry Andric 194406c3fb27SDimitry Andric // Does the previous VALU have this DS_Write as a successor 194506c3fb27SDimitry Andric return (std::any_of(OtherGroup->Collection.begin(), 194606c3fb27SDimitry Andric OtherGroup->Collection.end(), [&SU](SUnit *Elt) { 194706c3fb27SDimitry Andric return std::any_of(Elt->Succs.begin(), 194806c3fb27SDimitry Andric Elt->Succs.end(), 194906c3fb27SDimitry Andric [&SU](SDep &Succ) { 195006c3fb27SDimitry Andric return Succ.getSUnit() == SU; 195106c3fb27SDimitry Andric }); 195206c3fb27SDimitry Andric })); 195306c3fb27SDimitry Andric } 195406c3fb27SDimitry Andric IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, 195506c3fb27SDimitry Andric bool NeedsCache = false) 195606c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 195706c3fb27SDimitry Andric }; 195806c3fb27SDimitry Andric 195906c3fb27SDimitry Andric // Whether the combined load width of group is 128 bits 196006c3fb27SDimitry Andric class VMEMSize final : public InstructionRule { 196106c3fb27SDimitry Andric public: 196206c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 196306c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 196406c3fb27SDimitry Andric auto MI = SU->getInstr(); 196506c3fb27SDimitry Andric if (MI->getOpcode() == TargetOpcode::BUNDLE) 196606c3fb27SDimitry Andric return false; 196706c3fb27SDimitry Andric if (!Collection.size()) 196806c3fb27SDimitry Andric return true; 196906c3fb27SDimitry Andric 197006c3fb27SDimitry Andric int NumBits = 0; 197106c3fb27SDimitry Andric 197206c3fb27SDimitry Andric auto TRI = TII->getRegisterInfo(); 197306c3fb27SDimitry Andric auto &MRI = MI->getParent()->getParent()->getRegInfo(); 197406c3fb27SDimitry Andric for (auto &Elt : Collection) { 197506c3fb27SDimitry Andric auto Op = Elt->getInstr()->getOperand(0); 197606c3fb27SDimitry Andric auto Size = 197706c3fb27SDimitry Andric TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); 197806c3fb27SDimitry Andric NumBits += Size; 197906c3fb27SDimitry Andric } 198006c3fb27SDimitry Andric 198106c3fb27SDimitry Andric if (NumBits < 128) { 198206c3fb27SDimitry Andric assert(TII->isVMEM(*MI) && MI->mayLoad()); 198306c3fb27SDimitry Andric if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( 198406c3fb27SDimitry Andric MRI, MI->getOperand(0))) <= 198506c3fb27SDimitry Andric 128) 198606c3fb27SDimitry Andric return true; 198706c3fb27SDimitry Andric } 198806c3fb27SDimitry Andric 198906c3fb27SDimitry Andric return false; 199006c3fb27SDimitry Andric } 199106c3fb27SDimitry Andric 199206c3fb27SDimitry Andric VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 199306c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache) {} 199406c3fb27SDimitry Andric }; 199506c3fb27SDimitry Andric 19965f757f3fSDimitry Andric /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 19975f757f3fSDimitry Andric /// that is \p Distance steps away 199806c3fb27SDimitry Andric class SharesPredWithPrevNthGroup final : public InstructionRule { 199906c3fb27SDimitry Andric private: 200006c3fb27SDimitry Andric unsigned Distance = 1; 200106c3fb27SDimitry Andric 200206c3fb27SDimitry Andric public: 200306c3fb27SDimitry Andric bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 200406c3fb27SDimitry Andric SmallVectorImpl<SchedGroup> &SyncPipe) override { 200506c3fb27SDimitry Andric SchedGroup *OtherGroup = nullptr; 200606c3fb27SDimitry Andric if (!SyncPipe.size()) 200706c3fb27SDimitry Andric return false; 200806c3fb27SDimitry Andric 200906c3fb27SDimitry Andric if (!Cache->size()) { 201006c3fb27SDimitry Andric 201106c3fb27SDimitry Andric for (auto &PipeSG : SyncPipe) { 201206c3fb27SDimitry Andric if ((unsigned)PipeSG.getSGID() == SGID - Distance) { 201306c3fb27SDimitry Andric OtherGroup = &PipeSG; 201406c3fb27SDimitry Andric } 201506c3fb27SDimitry Andric } 201606c3fb27SDimitry Andric 201706c3fb27SDimitry Andric if (!OtherGroup) 201806c3fb27SDimitry Andric return false; 201906c3fb27SDimitry Andric if (!OtherGroup->Collection.size()) 202006c3fb27SDimitry Andric return true; 202106c3fb27SDimitry Andric 202206c3fb27SDimitry Andric for (auto &OtherEle : OtherGroup->Collection) { 202306c3fb27SDimitry Andric for (auto &Pred : OtherEle->Preds) { 202406c3fb27SDimitry Andric if (Pred.getSUnit()->getInstr()->getOpcode() == 202506c3fb27SDimitry Andric AMDGPU::V_PERM_B32_e64) 202606c3fb27SDimitry Andric Cache->push_back(Pred.getSUnit()); 202706c3fb27SDimitry Andric } 202806c3fb27SDimitry Andric } 20295f757f3fSDimitry Andric 20305f757f3fSDimitry Andric // If the other group has no PERM preds, then this group won't share any 20315f757f3fSDimitry Andric if (!Cache->size()) 20325f757f3fSDimitry Andric return false; 203306c3fb27SDimitry Andric } 203406c3fb27SDimitry Andric 203506c3fb27SDimitry Andric auto DAG = SyncPipe[0].DAG; 203606c3fb27SDimitry Andric // Does the previous DS_WRITE share a V_PERM predecessor with this 203706c3fb27SDimitry Andric // VMEM_READ 20385f757f3fSDimitry Andric return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) { 203906c3fb27SDimitry Andric return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); 20405f757f3fSDimitry Andric }); 204106c3fb27SDimitry Andric } 204206c3fb27SDimitry Andric SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, 204306c3fb27SDimitry Andric unsigned SGID, bool NeedsCache = false) 204406c3fb27SDimitry Andric : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} 204506c3fb27SDimitry Andric }; 204606c3fb27SDimitry Andric 204706c3fb27SDimitry Andric public: 2048*0fca6ea1SDimitry Andric bool applyIGLPStrategy( 204906c3fb27SDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 20505f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 2051*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) override; 205206c3fb27SDimitry Andric 2053*0fca6ea1SDimitry Andric bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 2054*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) override { 2055*0fca6ea1SDimitry Andric return true; 2056*0fca6ea1SDimitry Andric } 205706c3fb27SDimitry Andric 205806c3fb27SDimitry Andric MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 205906c3fb27SDimitry Andric : IGLPStrategy(DAG, TII) { 2060*0fca6ea1SDimitry Andric IsBottomUp = false; 206106c3fb27SDimitry Andric } 206206c3fb27SDimitry Andric }; 206306c3fb27SDimitry Andric 20645f757f3fSDimitry Andric static unsigned DSWCount = 0; 20655f757f3fSDimitry Andric static unsigned DSWWithPermCount = 0; 20665f757f3fSDimitry Andric static unsigned DSWWithSharedVMEMCount = 0; 20675f757f3fSDimitry Andric 2068*0fca6ea1SDimitry Andric bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( 206906c3fb27SDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 20705f757f3fSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 2071*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase) { 207206c3fb27SDimitry Andric unsigned MFMACount = 0; 207306c3fb27SDimitry Andric unsigned DSRCount = 0; 20745f757f3fSDimitry Andric 2075*0fca6ea1SDimitry Andric bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial; 2076*0fca6ea1SDimitry Andric 2077*0fca6ea1SDimitry Andric assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && 20785f757f3fSDimitry Andric DSWWithSharedVMEMCount == 0)) && 20795f757f3fSDimitry Andric "DSWCounters should be zero in pre-RA scheduling!"); 208006c3fb27SDimitry Andric SmallVector<SUnit *, 6> DSWithPerms; 208106c3fb27SDimitry Andric for (auto &SU : DAG->SUnits) { 208206c3fb27SDimitry Andric auto I = SU.getInstr(); 208306c3fb27SDimitry Andric if (TII->isMFMAorWMMA(*I)) 208406c3fb27SDimitry Andric ++MFMACount; 208506c3fb27SDimitry Andric else if (TII->isDS(*I)) { 208606c3fb27SDimitry Andric if (I->mayLoad()) 208706c3fb27SDimitry Andric ++DSRCount; 2088*0fca6ea1SDimitry Andric else if (I->mayStore() && IsInitial) { 208906c3fb27SDimitry Andric ++DSWCount; 209006c3fb27SDimitry Andric for (auto Pred : SU.Preds) { 209106c3fb27SDimitry Andric if (Pred.getSUnit()->getInstr()->getOpcode() == 209206c3fb27SDimitry Andric AMDGPU::V_PERM_B32_e64) { 209306c3fb27SDimitry Andric DSWithPerms.push_back(&SU); 209406c3fb27SDimitry Andric break; 209506c3fb27SDimitry Andric } 209606c3fb27SDimitry Andric } 209706c3fb27SDimitry Andric } 209806c3fb27SDimitry Andric } 209906c3fb27SDimitry Andric } 21005f757f3fSDimitry Andric 2101*0fca6ea1SDimitry Andric if (IsInitial) { 210206c3fb27SDimitry Andric DSWWithPermCount = DSWithPerms.size(); 210306c3fb27SDimitry Andric auto I = DSWithPerms.begin(); 210406c3fb27SDimitry Andric auto E = DSWithPerms.end(); 210506c3fb27SDimitry Andric 210606c3fb27SDimitry Andric // Get the count of DS_WRITES with V_PERM predecessors which 210706c3fb27SDimitry Andric // have loop carried dependencies (WAR) on the same VMEM_READs. 210806c3fb27SDimitry Andric // We consider partial overlap as a miss -- in other words, 210906c3fb27SDimitry Andric // for a given DS_W, we only consider another DS_W as matching 211006c3fb27SDimitry Andric // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred 211106c3fb27SDimitry Andric // for every V_PERM pred of this DS_W. 211206c3fb27SDimitry Andric DenseMap<MachineInstr *, SUnit *> VMEMLookup; 211306c3fb27SDimitry Andric SmallVector<SUnit *, 6> Counted; 211406c3fb27SDimitry Andric for (; I != E; I++) { 211506c3fb27SDimitry Andric SUnit *Cand = nullptr; 211606c3fb27SDimitry Andric bool MissedAny = false; 211706c3fb27SDimitry Andric for (auto &Pred : (*I)->Preds) { 211806c3fb27SDimitry Andric if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) 211906c3fb27SDimitry Andric continue; 212006c3fb27SDimitry Andric 21215f757f3fSDimitry Andric if (Cand && llvm::is_contained(Counted, Cand)) 212206c3fb27SDimitry Andric break; 212306c3fb27SDimitry Andric 212406c3fb27SDimitry Andric for (auto &Succ : Pred.getSUnit()->Succs) { 212506c3fb27SDimitry Andric auto MI = Succ.getSUnit()->getInstr(); 212606c3fb27SDimitry Andric if (!TII->isVMEM(*MI) || !MI->mayLoad()) 212706c3fb27SDimitry Andric continue; 212806c3fb27SDimitry Andric 212906c3fb27SDimitry Andric if (MissedAny || !VMEMLookup.size()) { 213006c3fb27SDimitry Andric MissedAny = true; 213106c3fb27SDimitry Andric VMEMLookup[MI] = *I; 213206c3fb27SDimitry Andric continue; 213306c3fb27SDimitry Andric } 213406c3fb27SDimitry Andric 213506c3fb27SDimitry Andric if (!VMEMLookup.contains(MI)) { 213606c3fb27SDimitry Andric MissedAny = true; 213706c3fb27SDimitry Andric VMEMLookup[MI] = *I; 213806c3fb27SDimitry Andric continue; 213906c3fb27SDimitry Andric } 214006c3fb27SDimitry Andric 214106c3fb27SDimitry Andric Cand = VMEMLookup[MI]; 21425f757f3fSDimitry Andric if (llvm::is_contained(Counted, Cand)) { 214306c3fb27SDimitry Andric MissedAny = true; 214406c3fb27SDimitry Andric break; 214506c3fb27SDimitry Andric } 214606c3fb27SDimitry Andric } 214706c3fb27SDimitry Andric } 214806c3fb27SDimitry Andric if (!MissedAny && Cand) { 214906c3fb27SDimitry Andric DSWWithSharedVMEMCount += 2; 215006c3fb27SDimitry Andric Counted.push_back(Cand); 215106c3fb27SDimitry Andric Counted.push_back(*I); 215206c3fb27SDimitry Andric } 215306c3fb27SDimitry Andric } 21545f757f3fSDimitry Andric } 215506c3fb27SDimitry Andric 215606c3fb27SDimitry Andric assert(DSWWithSharedVMEMCount <= DSWWithPermCount); 215706c3fb27SDimitry Andric SchedGroup *SG; 215806c3fb27SDimitry Andric unsigned PipelineSyncID = 0; 215906c3fb27SDimitry Andric // For kernels with V_PERM, there are enough VALU to mix in between MFMAs 216006c3fb27SDimitry Andric if (DSWWithPermCount) { 216106c3fb27SDimitry Andric for (unsigned I = 0; I < MFMACount; I++) { 216206c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 216306c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 216406c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 216506c3fb27SDimitry Andric 216606c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 216706c3fb27SDimitry Andric SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); 216806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 216906c3fb27SDimitry Andric } 217006c3fb27SDimitry Andric } 217106c3fb27SDimitry Andric 217206c3fb27SDimitry Andric PipelineSyncID = 1; 217306c3fb27SDimitry Andric // Phase 1: Break up DS_READ and MFMA clusters. 217406c3fb27SDimitry Andric // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ 217506c3fb27SDimitry Andric // prefetch 217606c3fb27SDimitry Andric 217706c3fb27SDimitry Andric // Make ready initial MFMA 217806c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 217906c3fb27SDimitry Andric SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); 218006c3fb27SDimitry Andric SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); 218106c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 218206c3fb27SDimitry Andric 218306c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 218406c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 218506c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 218606c3fb27SDimitry Andric 218706c3fb27SDimitry Andric // Interleave MFMA with DS_READ prefetch 218806c3fb27SDimitry Andric for (unsigned I = 0; I < DSRCount - 4; ++I) { 218906c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 219006c3fb27SDimitry Andric SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); 219106c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 219206c3fb27SDimitry Andric 219306c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 219406c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 219506c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 219606c3fb27SDimitry Andric } 219706c3fb27SDimitry Andric 219806c3fb27SDimitry Andric // Phase 2a: Loop carried dependency with V_PERM 219906c3fb27SDimitry Andric // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 220006c3fb27SDimitry Andric // depend on. Interleave MFMA to keep XDL unit busy throughout. 220106c3fb27SDimitry Andric for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { 220206c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 220306c3fb27SDimitry Andric SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 220406c3fb27SDimitry Andric SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 220506c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 220606c3fb27SDimitry Andric 220706c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 220806c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 2209*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); 221006c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 221106c3fb27SDimitry Andric 221206c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 221306c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 221406c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 221506c3fb27SDimitry Andric 1, TII, SG->getSGID(), true)); 2216*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 221706c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 221806c3fb27SDimitry Andric 221906c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 222006c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 222106c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 222206c3fb27SDimitry Andric 222306c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 222406c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 222506c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 222606c3fb27SDimitry Andric 3, TII, SG->getSGID(), true)); 2227*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 222806c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 222906c3fb27SDimitry Andric 223006c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 223106c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 223206c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 223306c3fb27SDimitry Andric } 223406c3fb27SDimitry Andric 223506c3fb27SDimitry Andric // Phase 2b: Loop carried dependency without V_PERM 223606c3fb27SDimitry Andric // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. 223706c3fb27SDimitry Andric // Interleave MFMA to keep XDL unit busy throughout. 223806c3fb27SDimitry Andric for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { 223906c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 224006c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 224106c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 224206c3fb27SDimitry Andric 224306c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 224406c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 2245*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 224606c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 224706c3fb27SDimitry Andric 224806c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 224906c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 225006c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 225106c3fb27SDimitry Andric } 225206c3fb27SDimitry Andric 225306c3fb27SDimitry Andric // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are 225406c3fb27SDimitry Andric // ultimately used by two DS_WRITE 225506c3fb27SDimitry Andric // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 225606c3fb27SDimitry Andric // depend on. Interleave MFMA to keep XDL unit busy throughout. 225706c3fb27SDimitry Andric 225806c3fb27SDimitry Andric for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { 225906c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 226006c3fb27SDimitry Andric SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 226106c3fb27SDimitry Andric SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 226206c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 226306c3fb27SDimitry Andric 226406c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 226506c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 2266*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); 226706c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 226806c3fb27SDimitry Andric 226906c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 227006c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 227106c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 227206c3fb27SDimitry Andric 227306c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 227406c3fb27SDimitry Andric SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 227506c3fb27SDimitry Andric SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 227606c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 227706c3fb27SDimitry Andric 227806c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 227906c3fb27SDimitry Andric SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 2280*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); 228106c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 228206c3fb27SDimitry Andric 228306c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 228406c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 228506c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 228606c3fb27SDimitry Andric 228706c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 228806c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 228906c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 229006c3fb27SDimitry Andric 2, TII, SG->getSGID(), true)); 2291*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 229206c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 229306c3fb27SDimitry Andric 229406c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 229506c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 229606c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 229706c3fb27SDimitry Andric 229806c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 229906c3fb27SDimitry Andric SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 230006c3fb27SDimitry Andric SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 230106c3fb27SDimitry Andric 4, TII, SG->getSGID(), true)); 2302*0fca6ea1SDimitry Andric SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 230306c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 230406c3fb27SDimitry Andric 230506c3fb27SDimitry Andric SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 230606c3fb27SDimitry Andric SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 230706c3fb27SDimitry Andric SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 230806c3fb27SDimitry Andric } 2309*0fca6ea1SDimitry Andric 2310*0fca6ea1SDimitry Andric return true; 231106c3fb27SDimitry Andric } 231206c3fb27SDimitry Andric 2313bdd1243dSDimitry Andric static std::unique_ptr<IGLPStrategy> 2314bdd1243dSDimitry Andric createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, 2315bdd1243dSDimitry Andric const SIInstrInfo *TII) { 2316bdd1243dSDimitry Andric switch (ID) { 2317bdd1243dSDimitry Andric case MFMASmallGemmOptID: 2318bdd1243dSDimitry Andric return std::make_unique<MFMASmallGemmOpt>(DAG, TII); 231906c3fb27SDimitry Andric case MFMASmallGemmSingleWaveOptID: 232006c3fb27SDimitry Andric return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII); 2321*0fca6ea1SDimitry Andric case MFMAExpInterleave: 2322*0fca6ea1SDimitry Andric return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII); 2323bdd1243dSDimitry Andric } 2324bdd1243dSDimitry Andric 2325bdd1243dSDimitry Andric llvm_unreachable("Unknown IGLPStrategyID"); 2326bdd1243dSDimitry Andric } 2327bdd1243dSDimitry Andric 2328bdd1243dSDimitry Andric class IGroupLPDAGMutation : public ScheduleDAGMutation { 2329bdd1243dSDimitry Andric private: 2330bdd1243dSDimitry Andric const SIInstrInfo *TII; 2331bdd1243dSDimitry Andric 2332bdd1243dSDimitry Andric ScheduleDAGMI *DAG; 2333bdd1243dSDimitry Andric 2334bdd1243dSDimitry Andric // Organize lists of SchedGroups by their SyncID. SchedGroups / 2335bdd1243dSDimitry Andric // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added 2336bdd1243dSDimitry Andric // between then. 2337bdd1243dSDimitry Andric DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; 2338bdd1243dSDimitry Andric 2339bdd1243dSDimitry Andric // Used to track instructions that can be mapped to multiple sched groups 2340bdd1243dSDimitry Andric DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; 2341bdd1243dSDimitry Andric 2342bdd1243dSDimitry Andric // Add DAG edges that enforce SCHED_BARRIER ordering. 2343bdd1243dSDimitry Andric void addSchedBarrierEdges(SUnit &SU); 2344bdd1243dSDimitry Andric 2345bdd1243dSDimitry Andric // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should 2346bdd1243dSDimitry Andric // not be reordered accross the SCHED_BARRIER. This is used for the base 2347bdd1243dSDimitry Andric // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that 2348bdd1243dSDimitry Andric // SCHED_BARRIER will always block all instructions that can be classified 2349bdd1243dSDimitry Andric // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size 2350bdd1243dSDimitry Andric // and may only synchronize with some SchedGroups. Returns the inverse of 2351bdd1243dSDimitry Andric // Mask. SCHED_BARRIER's mask describes which instruction types should be 2352bdd1243dSDimitry Andric // allowed to be scheduled across it. Invert the mask to get the 2353bdd1243dSDimitry Andric // SchedGroupMask of instructions that should be barred. 2354bdd1243dSDimitry Andric SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; 2355bdd1243dSDimitry Andric 2356bdd1243dSDimitry Andric // Create SchedGroups for a SCHED_GROUP_BARRIER. 2357bdd1243dSDimitry Andric void initSchedGroupBarrierPipelineStage( 2358bdd1243dSDimitry Andric std::vector<SUnit>::reverse_iterator RIter); 2359bdd1243dSDimitry Andric 2360*0fca6ea1SDimitry Andric bool initIGLPOpt(SUnit &SU); 2361bdd1243dSDimitry Andric 2362bdd1243dSDimitry Andric public: 2363bdd1243dSDimitry Andric void apply(ScheduleDAGInstrs *DAGInstrs) override; 2364bdd1243dSDimitry Andric 236506c3fb27SDimitry Andric // The order in which the PipelineSolver should process the candidate 236606c3fb27SDimitry Andric // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last 236706c3fb27SDimitry Andric // created SchedGroup first, and will consider that as the ultimate 236806c3fb27SDimitry Andric // predecessor group when linking. TOP_DOWN instead links and processes the 236906c3fb27SDimitry Andric // first created SchedGroup first. 2370*0fca6ea1SDimitry Andric bool IsBottomUp = true; 237106c3fb27SDimitry Andric 2372*0fca6ea1SDimitry Andric // The scheduling phase this application of IGLP corresponds with. 2373*0fca6ea1SDimitry Andric AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; 23745f757f3fSDimitry Andric 2375bdd1243dSDimitry Andric IGroupLPDAGMutation() = default; 2376*0fca6ea1SDimitry Andric IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {} 2377bdd1243dSDimitry Andric }; 2378bdd1243dSDimitry Andric 2379bdd1243dSDimitry Andric unsigned SchedGroup::NumSchedGroups = 0; 2380bdd1243dSDimitry Andric 2381bdd1243dSDimitry Andric bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { 2382bdd1243dSDimitry Andric if (A != B && DAG->canAddEdge(B, A)) { 2383bdd1243dSDimitry Andric DAG->addEdge(B, SDep(A, SDep::Artificial)); 2384bdd1243dSDimitry Andric return true; 2385bdd1243dSDimitry Andric } 2386bdd1243dSDimitry Andric return false; 2387bdd1243dSDimitry Andric } 2388bdd1243dSDimitry Andric 2389bdd1243dSDimitry Andric bool SchedGroup::canAddMI(const MachineInstr &MI) const { 2390bdd1243dSDimitry Andric bool Result = false; 2391bdd1243dSDimitry Andric if (MI.isMetaInstruction()) 2392bdd1243dSDimitry Andric Result = false; 2393bdd1243dSDimitry Andric 2394bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && 2395cb14a3feSDimitry Andric (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || 2396cb14a3feSDimitry Andric TII->isTRANS(MI))) 2397bdd1243dSDimitry Andric Result = true; 2398bdd1243dSDimitry Andric 2399bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && 2400cb14a3feSDimitry Andric TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) 2401bdd1243dSDimitry Andric Result = true; 2402bdd1243dSDimitry Andric 2403bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && 2404bdd1243dSDimitry Andric TII->isSALU(MI)) 2405bdd1243dSDimitry Andric Result = true; 2406bdd1243dSDimitry Andric 2407bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && 2408bdd1243dSDimitry Andric TII->isMFMAorWMMA(MI)) 2409bdd1243dSDimitry Andric Result = true; 2410bdd1243dSDimitry Andric 2411bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && 2412bdd1243dSDimitry Andric (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 2413bdd1243dSDimitry Andric Result = true; 2414bdd1243dSDimitry Andric 2415bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && 2416bdd1243dSDimitry Andric MI.mayLoad() && 2417bdd1243dSDimitry Andric (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 2418bdd1243dSDimitry Andric Result = true; 2419bdd1243dSDimitry Andric 2420bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && 2421bdd1243dSDimitry Andric MI.mayStore() && 2422bdd1243dSDimitry Andric (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 2423bdd1243dSDimitry Andric Result = true; 2424bdd1243dSDimitry Andric 2425bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && 2426bdd1243dSDimitry Andric TII->isDS(MI)) 2427bdd1243dSDimitry Andric Result = true; 2428bdd1243dSDimitry Andric 2429bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && 2430bdd1243dSDimitry Andric MI.mayLoad() && TII->isDS(MI)) 2431bdd1243dSDimitry Andric Result = true; 2432bdd1243dSDimitry Andric 2433bdd1243dSDimitry Andric else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && 2434bdd1243dSDimitry Andric MI.mayStore() && TII->isDS(MI)) 2435bdd1243dSDimitry Andric Result = true; 2436bdd1243dSDimitry Andric 2437cb14a3feSDimitry Andric else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) && 2438cb14a3feSDimitry Andric TII->isTRANS(MI)) 2439cb14a3feSDimitry Andric Result = true; 2440cb14a3feSDimitry Andric 2441bdd1243dSDimitry Andric LLVM_DEBUG( 2442bdd1243dSDimitry Andric dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) 2443bdd1243dSDimitry Andric << (Result ? " could classify " : " unable to classify ") << MI); 2444bdd1243dSDimitry Andric 2445bdd1243dSDimitry Andric return Result; 2446bdd1243dSDimitry Andric } 2447bdd1243dSDimitry Andric 2448bdd1243dSDimitry Andric int SchedGroup::link(SUnit &SU, bool MakePred, 2449bdd1243dSDimitry Andric std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 2450bdd1243dSDimitry Andric int MissedEdges = 0; 2451bdd1243dSDimitry Andric for (auto *A : Collection) { 2452bdd1243dSDimitry Andric SUnit *B = &SU; 2453bdd1243dSDimitry Andric if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 2454bdd1243dSDimitry Andric continue; 2455bdd1243dSDimitry Andric if (MakePred) 2456bdd1243dSDimitry Andric std::swap(A, B); 2457bdd1243dSDimitry Andric 2458bdd1243dSDimitry Andric if (DAG->IsReachable(B, A)) 2459bdd1243dSDimitry Andric continue; 246006c3fb27SDimitry Andric 2461bdd1243dSDimitry Andric // tryAddEdge returns false if there is a dependency that makes adding 2462bdd1243dSDimitry Andric // the A->B edge impossible, otherwise it returns true; 2463bdd1243dSDimitry Andric bool Added = tryAddEdge(A, B); 2464bdd1243dSDimitry Andric if (Added) 2465*0fca6ea1SDimitry Andric AddedEdges.emplace_back(A, B); 2466bdd1243dSDimitry Andric else 2467bdd1243dSDimitry Andric ++MissedEdges; 2468bdd1243dSDimitry Andric } 2469bdd1243dSDimitry Andric 2470bdd1243dSDimitry Andric return MissedEdges; 2471bdd1243dSDimitry Andric } 2472bdd1243dSDimitry Andric 2473bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU, bool MakePred) { 2474bdd1243dSDimitry Andric for (auto *A : Collection) { 2475bdd1243dSDimitry Andric SUnit *B = &SU; 2476bdd1243dSDimitry Andric if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 2477bdd1243dSDimitry Andric continue; 2478bdd1243dSDimitry Andric if (MakePred) 2479bdd1243dSDimitry Andric std::swap(A, B); 2480bdd1243dSDimitry Andric 2481bdd1243dSDimitry Andric tryAddEdge(A, B); 2482bdd1243dSDimitry Andric } 2483bdd1243dSDimitry Andric } 2484bdd1243dSDimitry Andric 2485bdd1243dSDimitry Andric void SchedGroup::link(SUnit &SU, 2486bdd1243dSDimitry Andric function_ref<bool(const SUnit *A, const SUnit *B)> P) { 2487bdd1243dSDimitry Andric for (auto *A : Collection) { 2488bdd1243dSDimitry Andric SUnit *B = &SU; 2489bdd1243dSDimitry Andric if (P(A, B)) 2490bdd1243dSDimitry Andric std::swap(A, B); 2491bdd1243dSDimitry Andric 2492bdd1243dSDimitry Andric tryAddEdge(A, B); 2493bdd1243dSDimitry Andric } 2494bdd1243dSDimitry Andric } 2495bdd1243dSDimitry Andric 2496bdd1243dSDimitry Andric void SchedGroup::link(SchedGroup &OtherGroup) { 2497bdd1243dSDimitry Andric for (auto *B : OtherGroup.Collection) 2498bdd1243dSDimitry Andric link(*B); 2499bdd1243dSDimitry Andric } 2500bdd1243dSDimitry Andric 2501bdd1243dSDimitry Andric bool SchedGroup::canAddSU(SUnit &SU) const { 2502bdd1243dSDimitry Andric MachineInstr &MI = *SU.getInstr(); 2503bdd1243dSDimitry Andric if (MI.getOpcode() != TargetOpcode::BUNDLE) 2504bdd1243dSDimitry Andric return canAddMI(MI); 2505bdd1243dSDimitry Andric 2506bdd1243dSDimitry Andric // Special case for bundled MIs. 2507bdd1243dSDimitry Andric const MachineBasicBlock *MBB = MI.getParent(); 2508bdd1243dSDimitry Andric MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; 2509bdd1243dSDimitry Andric while (E != MBB->end() && E->isBundledWithPred()) 2510bdd1243dSDimitry Andric ++E; 2511bdd1243dSDimitry Andric 2512bdd1243dSDimitry Andric // Return true if all of the bundled MIs can be added to this group. 2513bdd1243dSDimitry Andric return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); 2514bdd1243dSDimitry Andric } 2515bdd1243dSDimitry Andric 2516bdd1243dSDimitry Andric void SchedGroup::initSchedGroup() { 2517bdd1243dSDimitry Andric for (auto &SU : DAG->SUnits) { 2518bdd1243dSDimitry Andric if (isFull()) 2519bdd1243dSDimitry Andric break; 2520bdd1243dSDimitry Andric 2521bdd1243dSDimitry Andric if (canAddSU(SU)) 2522bdd1243dSDimitry Andric add(SU); 2523bdd1243dSDimitry Andric } 2524bdd1243dSDimitry Andric } 2525bdd1243dSDimitry Andric 2526bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 2527bdd1243dSDimitry Andric SUnitsToCandidateSGsMap &SyncedInstrs) { 2528bdd1243dSDimitry Andric SUnit &InitSU = *RIter; 2529bdd1243dSDimitry Andric for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { 2530bdd1243dSDimitry Andric auto &SU = *RIter; 2531bdd1243dSDimitry Andric if (isFull()) 2532bdd1243dSDimitry Andric break; 2533bdd1243dSDimitry Andric 2534bdd1243dSDimitry Andric if (canAddSU(SU)) 2535bdd1243dSDimitry Andric SyncedInstrs[&SU].push_back(SGID); 2536bdd1243dSDimitry Andric } 2537bdd1243dSDimitry Andric 2538bdd1243dSDimitry Andric add(InitSU); 2539bdd1243dSDimitry Andric assert(MaxSize); 2540bdd1243dSDimitry Andric (*MaxSize)++; 2541bdd1243dSDimitry Andric } 2542bdd1243dSDimitry Andric 2543bdd1243dSDimitry Andric void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { 2544bdd1243dSDimitry Andric auto I = DAG->SUnits.rbegin(); 2545bdd1243dSDimitry Andric auto E = DAG->SUnits.rend(); 2546bdd1243dSDimitry Andric for (; I != E; ++I) { 2547bdd1243dSDimitry Andric auto &SU = *I; 2548bdd1243dSDimitry Andric if (isFull()) 2549bdd1243dSDimitry Andric break; 2550bdd1243dSDimitry Andric if (canAddSU(SU)) 2551bdd1243dSDimitry Andric SyncedInstrs[&SU].push_back(SGID); 2552bdd1243dSDimitry Andric } 2553bdd1243dSDimitry Andric } 2554bdd1243dSDimitry Andric 2555bdd1243dSDimitry Andric void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { 255681ad6265SDimitry Andric const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); 255781ad6265SDimitry Andric if (!TSchedModel || DAGInstrs->SUnits.empty()) 255881ad6265SDimitry Andric return; 255981ad6265SDimitry Andric 2560bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); 256181ad6265SDimitry Andric const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); 256281ad6265SDimitry Andric TII = ST.getInstrInfo(); 256381ad6265SDimitry Andric DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); 2564bdd1243dSDimitry Andric SyncedSchedGroups.clear(); 2565bdd1243dSDimitry Andric SyncedInstrs.clear(); 2566*0fca6ea1SDimitry Andric bool FoundSB = false; 2567*0fca6ea1SDimitry Andric bool FoundIGLP = false; 2568*0fca6ea1SDimitry Andric bool ShouldApplyIGLP = false; 2569bdd1243dSDimitry Andric for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { 2570bdd1243dSDimitry Andric unsigned Opc = R->getInstr()->getOpcode(); 2571bdd1243dSDimitry Andric // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. 2572bdd1243dSDimitry Andric if (Opc == AMDGPU::SCHED_BARRIER) { 2573bdd1243dSDimitry Andric addSchedBarrierEdges(*R); 2574*0fca6ea1SDimitry Andric FoundSB = true; 2575bdd1243dSDimitry Andric } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { 2576bdd1243dSDimitry Andric initSchedGroupBarrierPipelineStage(R); 2577*0fca6ea1SDimitry Andric FoundSB = true; 2578bdd1243dSDimitry Andric } else if (Opc == AMDGPU::IGLP_OPT) { 2579bdd1243dSDimitry Andric resetEdges(*R, DAG); 2580*0fca6ea1SDimitry Andric if (!FoundSB && !FoundIGLP) { 2581*0fca6ea1SDimitry Andric FoundIGLP = true; 2582*0fca6ea1SDimitry Andric ShouldApplyIGLP = initIGLPOpt(*R); 2583*0fca6ea1SDimitry Andric } 2584bdd1243dSDimitry Andric } 258581ad6265SDimitry Andric } 258681ad6265SDimitry Andric 2587*0fca6ea1SDimitry Andric if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) { 258806c3fb27SDimitry Andric PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); 2589bdd1243dSDimitry Andric // PipelineSolver performs the mutation by adding the edges it 2590bdd1243dSDimitry Andric // determined as the best 2591bdd1243dSDimitry Andric PS.solve(); 2592*0fca6ea1SDimitry Andric return; 2593bdd1243dSDimitry Andric } 2594bdd1243dSDimitry Andric } 2595bdd1243dSDimitry Andric 2596bdd1243dSDimitry Andric void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { 259781ad6265SDimitry Andric MachineInstr &MI = *SchedBarrier.getInstr(); 259881ad6265SDimitry Andric assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); 259981ad6265SDimitry Andric // Remove all existing edges from the SCHED_BARRIER that were added due to the 260081ad6265SDimitry Andric // instruction having side effects. 2601bdd1243dSDimitry Andric resetEdges(SchedBarrier, DAG); 2602cb14a3feSDimitry Andric LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: " 2603cb14a3feSDimitry Andric << MI.getOperand(0).getImm() << "\n"); 2604bdd1243dSDimitry Andric auto InvertedMask = 2605bdd1243dSDimitry Andric invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); 2606bdd1243dSDimitry Andric SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); 2607bdd1243dSDimitry Andric SG.initSchedGroup(); 2608cb14a3feSDimitry Andric 2609bdd1243dSDimitry Andric // Preserve original instruction ordering relative to the SCHED_BARRIER. 2610bdd1243dSDimitry Andric SG.link( 2611bdd1243dSDimitry Andric SchedBarrier, 2612bdd1243dSDimitry Andric (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( 2613bdd1243dSDimitry Andric const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); 261481ad6265SDimitry Andric } 261581ad6265SDimitry Andric 2616bdd1243dSDimitry Andric SchedGroupMask 2617bdd1243dSDimitry Andric IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { 2618bdd1243dSDimitry Andric // Invert mask and erase bits for types of instructions that are implied to be 2619bdd1243dSDimitry Andric // allowed past the SCHED_BARRIER. 2620bdd1243dSDimitry Andric SchedGroupMask InvertedMask = ~Mask; 2621bdd1243dSDimitry Andric 2622cb14a3feSDimitry Andric // ALU implies VALU, SALU, MFMA, TRANS. 2623bdd1243dSDimitry Andric if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) 2624cb14a3feSDimitry Andric InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & 2625cb14a3feSDimitry Andric ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS; 2626cb14a3feSDimitry Andric // VALU, SALU, MFMA, TRANS implies ALU. 2627bdd1243dSDimitry Andric else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || 2628bdd1243dSDimitry Andric (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || 2629cb14a3feSDimitry Andric (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE || 2630cb14a3feSDimitry Andric (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE) 2631bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::ALU; 2632bdd1243dSDimitry Andric 2633bdd1243dSDimitry Andric // VMEM implies VMEM_READ, VMEM_WRITE. 2634bdd1243dSDimitry Andric if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) 2635bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; 2636bdd1243dSDimitry Andric // VMEM_READ, VMEM_WRITE implies VMEM. 2637bdd1243dSDimitry Andric else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || 2638bdd1243dSDimitry Andric (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) 2639bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::VMEM; 2640bdd1243dSDimitry Andric 2641bdd1243dSDimitry Andric // DS implies DS_READ, DS_WRITE. 2642bdd1243dSDimitry Andric if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) 2643bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; 2644bdd1243dSDimitry Andric // DS_READ, DS_WRITE implies DS. 2645bdd1243dSDimitry Andric else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || 2646bdd1243dSDimitry Andric (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) 2647bdd1243dSDimitry Andric InvertedMask &= ~SchedGroupMask::DS; 2648bdd1243dSDimitry Andric 2649cb14a3feSDimitry Andric LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask 2650cb14a3feSDimitry Andric << "\n"); 2651cb14a3feSDimitry Andric 2652bdd1243dSDimitry Andric return InvertedMask; 265381ad6265SDimitry Andric } 265481ad6265SDimitry Andric 2655bdd1243dSDimitry Andric void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( 2656bdd1243dSDimitry Andric std::vector<SUnit>::reverse_iterator RIter) { 2657bdd1243dSDimitry Andric // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due 2658bdd1243dSDimitry Andric // to the instruction having side effects. 2659bdd1243dSDimitry Andric resetEdges(*RIter, DAG); 2660bdd1243dSDimitry Andric MachineInstr &SGB = *RIter->getInstr(); 2661bdd1243dSDimitry Andric assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); 2662bdd1243dSDimitry Andric int32_t SGMask = SGB.getOperand(0).getImm(); 2663bdd1243dSDimitry Andric int32_t Size = SGB.getOperand(1).getImm(); 2664bdd1243dSDimitry Andric int32_t SyncID = SGB.getOperand(2).getImm(); 2665bdd1243dSDimitry Andric 2666bdd1243dSDimitry Andric auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, 2667bdd1243dSDimitry Andric Size, SyncID, DAG, TII); 2668bdd1243dSDimitry Andric 2669bdd1243dSDimitry Andric SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); 267081ad6265SDimitry Andric } 267181ad6265SDimitry Andric 2672*0fca6ea1SDimitry Andric bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { 2673bdd1243dSDimitry Andric IGLPStrategyID StrategyID = 2674bdd1243dSDimitry Andric (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); 2675bdd1243dSDimitry Andric auto S = createIGLPStrategy(StrategyID, DAG, TII); 2676*0fca6ea1SDimitry Andric if (!S->shouldApplyStrategy(DAG, Phase)) 2677*0fca6ea1SDimitry Andric return false; 2678*0fca6ea1SDimitry Andric 267906c3fb27SDimitry Andric IsBottomUp = S->IsBottomUp; 2680*0fca6ea1SDimitry Andric return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase); 268106c3fb27SDimitry Andric } 268281ad6265SDimitry Andric 268381ad6265SDimitry Andric } // namespace 268481ad6265SDimitry Andric 268581ad6265SDimitry Andric namespace llvm { 268681ad6265SDimitry Andric 2687*0fca6ea1SDimitry Andric /// \p Phase specifes whether or not this is a reentry into the 26885f757f3fSDimitry Andric /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the 26895f757f3fSDimitry Andric /// same scheduling region (e.g. pre and post-RA scheduling / multiple 26905f757f3fSDimitry Andric /// scheduling "phases"), we can reenter this mutation framework more than once 26915f757f3fSDimitry Andric /// for a given region. 2692*0fca6ea1SDimitry Andric std::unique_ptr<ScheduleDAGMutation> 2693*0fca6ea1SDimitry Andric createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { 2694*0fca6ea1SDimitry Andric return std::make_unique<IGroupLPDAGMutation>(Phase); 269581ad6265SDimitry Andric } 269681ad6265SDimitry Andric 269781ad6265SDimitry Andric } // end namespace llvm 2698