//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimum
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "igrouplp"

namespace {

static cl::opt<bool> EnableExactSolver(
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),
    cl::init(false));

static cl::opt<unsigned> CutoffForExact(
    "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));

static cl::opt<uint64_t> MaxBranchesExplored(
    "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
    cl::desc("The number of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

static cl::opt<bool> UseCostHeur(
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));

// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,
  VALU = 1u << 1,
  SALU = 1u << 2,
  MFMA = 1u << 3,
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  DS = 1u << 7,
  DS_READ = 1u << 8,
  DS_WRITE = 1u << 9,
  TRANS = 1u << 10,
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
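
// For reference, the bits above mirror the mask operand of the SCHED_BARRIER
// and SCHED_GROUP_BARRIER pseudos. As an illustrative sketch only (it assumes
// the clang builtin __builtin_amdgcn_sched_group_barrier(mask, size, sync_id),
// which is not defined in this file), the DS/MFMA interleave that
// MFMASmallGemmOpt builds below could be requested directly from kernel
// source roughly as:
//
//   // Repeated once per MFMA: 2 DS ops (mask 0x80), then 1 MFMA (mask 0x8),
//   // all in sync group 0.
//   __builtin_amdgcn_sched_group_barrier(0x0080, 2, 0);
//   __builtin_amdgcn_sched_group_barrier(0x0008, 1, 0);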
class SchedGroup;

// InstructionRule class is used to enact a filter which determines whether or
// not an SU maps to a given SchedGroup. It contains complementary data
// structures (e.g. Cache) to help those filters.
class InstructionRule {
protected:
  const SIInstrInfo *TII;
  unsigned SGID;
  // A cache made available to the Filter to store SUnits for subsequent
  // invocations of the Filter
  std::optional<SmallVector<SUnit *, 4>> Cache;

public:
  virtual bool
  apply(const SUnit *, const ArrayRef<SUnit *>,
        SmallVectorImpl<SchedGroup> &) {
    return true;
  };

  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
                  bool NeedsCache = false)
      : TII(TII), SGID(SGID) {
    if (NeedsCache) {
      Cache = SmallVector<SUnit *, 4>();
    }
  }

  virtual ~InstructionRule() = default;
};

using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;
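
// The rules defined later in this file all follow the same extension pattern:
// subclass InstructionRule and override apply(). A minimal sketch of a
// hypothetical rule (not used by any strategy in this file) that only admits
// SALU instructions might look like:
//
//   class IsSALU final : public InstructionRule {
//   public:
//     bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
//                SmallVectorImpl<SchedGroup> &SyncPipe) override {
//       return TII->isSALU(*SU->getInstr());
//     }
//     IsSALU(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
//         : InstructionRule(TII, SGID, NeedsCache) {}
//   };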
// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Mask that defines which instruction types can be classified into this
  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
  // and SCHED_GROUP_BARRIER.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the same
  // SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups
  unsigned SGID;

  // The different rules each instruction in this SchedGroup must conform to
  SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;

  // Count of the number of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;
  const SIInstrInfo *TII;

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies and track which edges are added, and the count of
  // missed edges
  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Append a constraint that SUs must meet in order to fit into this
  // SchedGroup. Since many rules involve the relationship between a SchedGroup
  // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
  // time (rather than SchedGroup init time).
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
    Rules.push_back(NewRule);
  }

  // Returns true if the SU matches all rules
  bool allowedByRules(const SUnit *SU,
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
    if (Rules.empty())
      return true;
    for (size_t I = 0; I < Rules.size(); I++) {
      auto TheRule = Rules[I].get();
      if (!TheRule->apply(SU, Collection, SyncPipe)) {
        return false;
      }
    }
    return true;
  }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove last element in the SchedGroup
  void pop() { Collection.pop_back(); }

  // Identify and add all relevant SUs from the DAG to this SchedGroup.
  void initSchedGroup();

  // Add instructions to the SchedGroup bottom up starting from RIter.
  // PipelineInstrs is a set of instructions that should not be added to the
  // SchedGroup even when the other conditions for adding it are satisfied.
  // RIter will be added to the SchedGroup as well, and dependencies will be
  // added so that RIter will always be scheduled at the end of the group.
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }
};

// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);

  while (!SU.Preds.empty())
    for (auto &P : SU.Preds)
      SU.removePred(P);

  while (!SU.Succs.empty())
    for (auto &S : SU.Succs)
      for (auto &SP : S.getSUnit()->Preds)
        if (SP.getSUnit() == &SU)
          S.getSUnit()->removePred(SP);
}

using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
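
// A small worked example of the problem the PipelineSolver (below) handles:
// if the requested pipeline for one SyncID is
// {VMEM_READ, VALU, MFMA, VMEM_READ}, then a VMEM_READ SU is a candidate for
// both the first and the last SchedGroup. SyncedInstrs then records something
// like SU -> {SGID 0, SGID 3} (SGIDs shown purely for illustration), and the
// solver must pick the alternative that loses the fewest pipeline ordering
// edges.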
// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
// in non-trivial cases. For example, if the requested pipeline is
// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
// in the DAG, then we will have an instruction that can not be trivially
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
// to find a good solution to the pipeline -- a greedy algorithm and an exact
// algorithm. The exact algorithm has exponential time complexity and should
// only be used for small problems, or for medium-sized problems where an
// exact solution is highly desired.
class PipelineSolver {
  ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
  // The current working pipeline
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
  // The pipeline that has the best solution found so far
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
  unsigned computeProblemSize();

  // The cost penalty of not assigning a SU to a SchedGroup
  int MissPenalty = 0;

  // Costs in terms of the number of edges we are unable to add
  int BestCost = -1;
  int CurrCost = 0;

  // Index pointing to the conflicting instruction that is currently being
  // fitted
  int CurrConflInstNo = 0;
  // Index to the pipeline that is currently being fitted
  int CurrSyncGroupIdx = 0;
  // The first non-trivial pipeline
  int BeginSyncGroupIdx = 0;

  // How many branches we have explored
  uint64_t BranchesExplored = 0;

  // The direction in which we process the candidate SchedGroups per SU
  bool IsBottomUp = true;

  // Update indices to fit next conflicting instruction
  void advancePosition();
  // Recede indices to attempt to find better fit for previous conflicting
  // instruction
  void retreatPosition();

  // The exponential time algorithm which finds the provably best fit
  bool solveExact();
  // The polynomial time algorithm which attempts to find a good fit
  bool solveGreedy();
  // Find the best SchedGroup for the current SU using the heuristic given all
  // current information. One step in the greedy algorithm. Templated against
  // the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
                  T E);
  // Whether or not the current solution is optimal
  bool checkOptimal();
  // Populate the ready list, prioritizing fewest missed edges first.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
                         T E);
  // Add edges corresponding to the SchedGroups as assigned by solver
  void makePipeline();
  // Link the SchedGroups in the best found pipeline.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T> void linkSchedGroups(T I, T E);
  // Add the edges from the SU to the other SchedGroups in pipeline, and
  // return the number of edges missed.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID.
  /// It returns the cost (in terms of missed pipeline edges), and tracks the
  /// edges added in \p AddedEdges.
  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
  /// Remove the edges passed via \p AddedEdges
  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Convert the passed in maps to arrays for bidirectional iterators
  void convertSyncMapsToArrays();

  void reset();

public:
  // Invoke the solver to map instructions to instruction groups. The
  // command-line options and the problem-size heuristic determine whether the
  // exact or the greedy algorithm is used.
  void solve();

  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
                 ScheduleDAGMI *DAG, bool IsBottomUp = true)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};

void PipelineSolver::reset() {

  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {
      SmallVector<SUnit *, 32> TempCollection = SG.Collection;
      SG.Collection.clear();
      auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
      });
      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);
    }
  }

  CurrSyncGroupIdx = BeginSyncGroupIdx;
  CurrConflInstNo = 0;
  CurrCost = 0;
}

void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
  }

  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
        continue;
      }
      auto SortPosition = PipelineInstrs[PipelineIDx].begin();
      // Insert them in sorted order -- this allows for good parsing order in
      // the greedy algorithm
      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
        ++SortPosition;
      PipelineInstrs[PipelineIDx].insert(
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
    }
    --PipelineIDx;
  }
}

template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
  for (; I != E; ++I) {
    auto &GroupA = *I;
    for (auto J = std::next(I); J != E; ++J) {
      auto &GroupB = *J;
      GroupA.link(GroupB);
    }
  }
}

void PipelineSolver::makePipeline() {
  // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
  for (auto &SyncPipeline : BestPipeline) {
    LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
    for (auto &SG :
SyncPipeline) { 456 LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() 457 << " has: \n"); 458 SUnit *SGBarr = nullptr; 459 for (auto &SU : SG.Collection) { 460 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 461 SGBarr = SU; 462 LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); 463 } 464 // Command line requested IGroupLP doesn't have SGBarr 465 if (!SGBarr) 466 continue; 467 resetEdges(*SGBarr, DAG); 468 SG.link(*SGBarr, false); 469 } 470 } 471 472 for (auto &SyncPipeline : BestPipeline) { 473 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) 474 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); 475 } 476 } 477 478 template <typename T> 479 int PipelineSolver::linkSUnit( 480 SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, 481 T I, T E) { 482 bool MakePred = false; 483 int AddedCost = 0; 484 for (; I < E; ++I) { 485 if (I->getSGID() == SGID) { 486 MakePred = true; 487 continue; 488 } 489 auto Group = *I; 490 AddedCost += Group.link(*SU, MakePred, AddedEdges); 491 assert(AddedCost >= 0); 492 } 493 return AddedCost; 494 } 495 496 int PipelineSolver::addEdges( 497 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, 498 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 499 500 // For IsBottomUp, the first SchedGroup in SyncPipeline contains the 501 // instructions that are the ultimate successors in the resultant mutation. 502 // Therefore, in such a configuration, the SchedGroups occurring before the 503 // candidate SGID are successors of the candidate SchedGroup, thus the current 504 // SU should be linked as a predecessor to SUs in those SchedGroups. The 505 // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple 506 // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using 507 // IsBottomUp (in reverse). 508 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), 509 SyncPipeline.rend()) 510 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), 511 SyncPipeline.end()); 512 } 513 514 void PipelineSolver::removeEdges( 515 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { 516 // Only remove the edges that we have added when testing 517 // the fit. 
518 for (auto &PredSuccPair : EdgesToRemove) { 519 SUnit *Pred = PredSuccPair.first; 520 SUnit *Succ = PredSuccPair.second; 521 522 auto Match = llvm::find_if( 523 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); 524 if (Match != Succ->Preds.end()) { 525 assert(Match->isArtificial()); 526 Succ->removePred(*Match); 527 } 528 } 529 } 530 531 void PipelineSolver::advancePosition() { 532 ++CurrConflInstNo; 533 534 if (static_cast<size_t>(CurrConflInstNo) >= 535 PipelineInstrs[CurrSyncGroupIdx].size()) { 536 CurrConflInstNo = 0; 537 ++CurrSyncGroupIdx; 538 // Advance to next non-trivial pipeline 539 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && 540 PipelineInstrs[CurrSyncGroupIdx].size() == 0) 541 ++CurrSyncGroupIdx; 542 } 543 } 544 545 void PipelineSolver::retreatPosition() { 546 assert(CurrConflInstNo >= 0); 547 assert(CurrSyncGroupIdx >= 0); 548 549 if (CurrConflInstNo > 0) { 550 --CurrConflInstNo; 551 return; 552 } 553 554 if (CurrConflInstNo == 0) { 555 // If we return to the starting position, we have explored 556 // the entire tree 557 if (CurrSyncGroupIdx == BeginSyncGroupIdx) 558 return; 559 560 --CurrSyncGroupIdx; 561 // Go to previous non-trivial pipeline 562 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) 563 --CurrSyncGroupIdx; 564 565 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; 566 } 567 } 568 569 bool PipelineSolver::checkOptimal() { 570 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { 571 if (BestCost == -1 || CurrCost < BestCost) { 572 BestPipeline = CurrPipeline; 573 BestCost = CurrCost; 574 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n"); 575 } 576 assert(BestCost >= 0); 577 } 578 579 bool DoneExploring = false; 580 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) 581 DoneExploring = true; 582 583 return (DoneExploring || BestCost == 0); 584 } 585 586 template <typename T> 587 void PipelineSolver::populateReadyList( 588 SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { 589 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 590 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 591 assert(CurrSU.second.size() >= 1); 592 593 for (; I != E; ++I) { 594 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 595 int CandSGID = *I; 596 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 597 return SG.getSGID() == CandSGID; 598 }); 599 assert(Match); 600 601 if (UseCostHeur) { 602 if (Match->isFull()) { 603 ReadyList.push_back(std::pair(*I, MissPenalty)); 604 continue; 605 } 606 607 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 608 ReadyList.push_back(std::pair(*I, TempCost)); 609 removeEdges(AddedEdges); 610 } else 611 ReadyList.push_back(std::pair(*I, -1)); 612 } 613 614 if (UseCostHeur) { 615 std::sort(ReadyList.begin(), ReadyList.end(), 616 [](std::pair<int, int> A, std::pair<int, int> B) { 617 return A.second < B.second; 618 }); 619 } 620 621 assert(ReadyList.size() == CurrSU.second.size()); 622 } 623 624 bool PipelineSolver::solveExact() { 625 if (checkOptimal()) 626 return true; 627 628 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) 629 return false; 630 631 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); 632 assert(static_cast<size_t>(CurrConflInstNo) < 633 PipelineInstrs[CurrSyncGroupIdx].size()); 634 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 635 LLVM_DEBUG(dbgs() << "Fitting SU(" << 
CurrSU.first->NodeNum 636 << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 637 638 // SchedGroup -> Cost pairs 639 SmallVector<std::pair<int, int>, 4> ReadyList; 640 // Prioritize the candidate sched groups in terms of lowest cost first 641 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(), 642 CurrSU.second.rend()) 643 : populateReadyList(ReadyList, CurrSU.second.begin(), 644 CurrSU.second.end()); 645 646 auto I = ReadyList.begin(); 647 auto E = ReadyList.end(); 648 for (; I != E; ++I) { 649 // If we are trying SGs in least cost order, and the current SG is cost 650 // infeasible, then all subsequent SGs will also be cost infeasible, so we 651 // can prune. 652 if (BestCost != -1 && (CurrCost + I->second > BestCost)) 653 return false; 654 655 int CandSGID = I->first; 656 int AddedCost = 0; 657 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 658 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 659 SchedGroup *Match; 660 for (auto &SG : SyncPipeline) { 661 if (SG.getSGID() == CandSGID) 662 Match = &SG; 663 } 664 665 if (Match->isFull()) 666 continue; 667 668 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) 669 continue; 670 671 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " 672 << (int)Match->getMask() << "and ID " << CandSGID 673 << "\n"); 674 Match->add(*CurrSU.first); 675 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 676 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n"); 677 CurrCost += AddedCost; 678 advancePosition(); 679 ++BranchesExplored; 680 bool FinishedExploring = false; 681 // If the Cost after adding edges is greater than a known solution, 682 // backtrack 683 if (CurrCost < BestCost || BestCost == -1) { 684 if (solveExact()) { 685 FinishedExploring = BestCost != 0; 686 if (!FinishedExploring) 687 return true; 688 } 689 } 690 691 retreatPosition(); 692 CurrCost -= AddedCost; 693 removeEdges(AddedEdges); 694 Match->pop(); 695 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 696 if (FinishedExploring) 697 return true; 698 } 699 700 // Try the pipeline where the current instruction is omitted 701 // Potentially if we omit a problematic instruction from the pipeline, 702 // all the other instructions can nicely fit. 703 CurrCost += MissPenalty; 704 advancePosition(); 705 706 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n"); 707 708 bool FinishedExploring = false; 709 if (CurrCost < BestCost || BestCost == -1) { 710 if (solveExact()) { 711 bool FinishedExploring = BestCost != 0; 712 if (!FinishedExploring) 713 return true; 714 } 715 } 716 717 retreatPosition(); 718 CurrCost -= MissPenalty; 719 return FinishedExploring; 720 } 721 722 template <typename T> 723 void PipelineSolver::greedyFind( 724 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { 725 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 726 int BestNodeCost = -1; 727 int TempCost; 728 SchedGroup *BestGroup = nullptr; 729 int BestGroupID = -1; 730 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 731 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 732 << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 733 734 // Since we have added the potential SchedGroups from bottom up, but 735 // traversed the DAG from top down, parse over the groups from last to 736 // first. If we fail to do this for the greedy algorithm, the solution will 737 // likely not be good in more complex cases. 
738 for (; I != E; ++I) { 739 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 740 int CandSGID = *I; 741 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 742 return SG.getSGID() == CandSGID; 743 }); 744 assert(Match); 745 746 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " 747 << (int)Match->getMask() << "\n"); 748 749 if (Match->isFull()) { 750 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); 751 continue; 752 } 753 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { 754 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n"); 755 continue; 756 } 757 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 758 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); 759 if (TempCost < BestNodeCost || BestNodeCost == -1) { 760 BestGroup = Match; 761 BestNodeCost = TempCost; 762 BestGroupID = CandSGID; 763 } 764 removeEdges(AddedEdges); 765 if (BestNodeCost == 0) 766 break; 767 } 768 769 if (BestGroupID != -1) { 770 BestGroup->add(*CurrSU.first); 771 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); 772 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" 773 << (int)BestGroup->getMask() << "\n"); 774 BestCost += TempCost; 775 } else 776 BestCost += MissPenalty; 777 778 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 779 } 780 781 bool PipelineSolver::solveGreedy() { 782 BestCost = 0; 783 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 784 785 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { 786 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 787 IsBottomUp 788 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) 789 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); 790 advancePosition(); 791 } 792 BestPipeline = CurrPipeline; 793 removeEdges(AddedEdges); 794 return false; 795 } 796 797 unsigned PipelineSolver::computeProblemSize() { 798 unsigned ProblemSize = 0; 799 for (auto &PipeConflicts : PipelineInstrs) { 800 ProblemSize += PipeConflicts.size(); 801 } 802 803 return ProblemSize; 804 } 805 806 void PipelineSolver::solve() { 807 if (!NeedsSolver) 808 return; 809 810 unsigned ProblemSize = computeProblemSize(); 811 assert(ProblemSize > 0); 812 813 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; 814 MissPenalty = (ProblemSize / 2) + 1; 815 816 LLVM_DEBUG(DAG->dump()); 817 if (EnableExactSolver || BelowCutoff) { 818 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n"); 819 solveGreedy(); 820 reset(); 821 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n"); 822 if (BestCost > 0) { 823 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n"); 824 solveExact(); 825 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n"); 826 } 827 } else { // Use the Greedy Algorithm by default 828 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n"); 829 solveGreedy(); 830 } 831 832 makePipeline(); 833 LLVM_DEBUG(dbgs() << "After applying mutation\n"); 834 LLVM_DEBUG(DAG->dump()); 835 } 836 837 enum IGLPStrategyID : int { 838 MFMASmallGemmOptID = 0, 839 MFMASmallGemmSingleWaveOptID = 1, 840 MFMAExpInterleave = 2 841 }; 842 843 // Implement a IGLP scheduling strategy. 844 class IGLPStrategy { 845 protected: 846 ScheduleDAGInstrs *DAG; 847 848 const SIInstrInfo *TII; 849 850 public: 851 /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. 
852 virtual bool applyIGLPStrategy( 853 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 854 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 855 AMDGPU::SchedulingPhase Phase) = 0; 856 857 // Returns true if this strategy should be applied to a ScheduleDAG. 858 virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 859 AMDGPU::SchedulingPhase Phase) = 0; 860 861 bool IsBottomUp = true; 862 863 IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 864 : DAG(DAG), TII(TII) {} 865 866 virtual ~IGLPStrategy() = default; 867 }; 868 869 class MFMASmallGemmOpt final : public IGLPStrategy { 870 private: 871 public: 872 bool applyIGLPStrategy( 873 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 874 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 875 AMDGPU::SchedulingPhase Phase) override; 876 877 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 878 AMDGPU::SchedulingPhase Phase) override { 879 return true; 880 } 881 882 MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 883 : IGLPStrategy(DAG, TII) { 884 IsBottomUp = true; 885 } 886 }; 887 888 bool MFMASmallGemmOpt::applyIGLPStrategy( 889 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 890 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 891 AMDGPU::SchedulingPhase Phase) { 892 // Count the number of MFMA instructions. 893 unsigned MFMACount = 0; 894 for (const MachineInstr &I : *DAG) 895 if (TII->isMFMAorWMMA(I)) 896 ++MFMACount; 897 898 const unsigned PipelineSyncID = 0; 899 SchedGroup *SG = nullptr; 900 for (unsigned I = 0; I < MFMACount * 3; ++I) { 901 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 902 SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); 903 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 904 905 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 906 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 907 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 908 } 909 910 return true; 911 } 912 913 class MFMAExpInterleaveOpt final : public IGLPStrategy { 914 private: 915 // The count of TRANS SUs involved in the interleaved pipeline 916 static unsigned TransPipeCount; 917 // The count of MFMA SUs involved in the interleaved pipeline 918 static unsigned MFMAPipeCount; 919 // The count of Add SUs involved in the interleaved pipeline 920 static unsigned AddPipeCount; 921 // The number of transitive MFMA successors for each TRANS SU 922 static unsigned MFMAEnablement; 923 // The number of transitive TRANS predecessors for each MFMA SU 924 static unsigned ExpRequirement; 925 // The count of independent "chains" of MFMA instructions in the pipeline 926 static unsigned MFMAChains; 927 // The length of each independent "chain" of MFMA instructions 928 static unsigned MFMAChainLength; 929 // Whether or not the pipeline has V_CVT instructions 930 static bool HasCvt; 931 // Whether or not there are instructions between the TRANS instruction and 932 // V_CVT 933 static bool HasChainBetweenCvt; 934 // The first occuring DS_READ which feeds an MFMA chain 935 static std::optional<unsigned> FirstPipeDSR; 936 // The MFMAPipe SUs with no MFMA predecessors 937 SmallVector<SUnit *, 4> MFMAChainSeeds; 938 // Compute the heuristics for the pipeline, returning whether or not the DAG 939 // is well formatted for the mutation 940 bool analyzeDAG(const SIInstrInfo *TII); 941 942 /// Whether or not the instruction is a transitive predecessor of an MFMA 943 /// instruction 944 class IsPipeExp final : public InstructionRule { 945 public: 946 bool apply(const SUnit *SU, 
const ArrayRef<SUnit *> Collection, 947 SmallVectorImpl<SchedGroup> &SyncPipe) override { 948 949 auto DAG = SyncPipe[0].DAG; 950 951 if (Cache->empty()) { 952 auto I = DAG->SUnits.rbegin(); 953 auto E = DAG->SUnits.rend(); 954 for (; I != E; I++) { 955 if (TII->isMFMAorWMMA(*I->getInstr())) 956 Cache->push_back(&*I); 957 } 958 if (Cache->empty()) 959 return false; 960 } 961 962 auto Reaches = (std::any_of( 963 Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) { 964 return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU)); 965 })); 966 967 return Reaches; 968 } 969 IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 970 : InstructionRule(TII, SGID, NeedsCache) {} 971 }; 972 973 /// Whether or not the instruction is a transitive predecessor of the 974 /// \p Number th MFMA of the MFMAs occuring after a TRANS instruction 975 class EnablesNthMFMA final : public InstructionRule { 976 private: 977 unsigned Number = 1; 978 979 public: 980 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 981 SmallVectorImpl<SchedGroup> &SyncPipe) override { 982 bool FoundTrans = false; 983 unsigned Counter = 1; 984 auto DAG = SyncPipe[0].DAG; 985 986 if (Cache->empty()) { 987 SmallVector<SUnit *, 8> Worklist; 988 989 auto I = DAG->SUnits.begin(); 990 auto E = DAG->SUnits.end(); 991 for (; I != E; I++) { 992 if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) { 993 if (Counter == Number) { 994 Cache->push_back(&*I); 995 break; 996 } 997 ++Counter; 998 } 999 if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode())) 1000 FoundTrans = true; 1001 } 1002 if (Cache->empty()) 1003 return false; 1004 } 1005 1006 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); 1007 } 1008 1009 EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID, 1010 bool NeedsCache = false) 1011 : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} 1012 }; 1013 1014 /// Whether or not the instruction enables the exact MFMA that is the \p 1015 /// Number th MFMA in the chain starting with \p ChainSeed 1016 class EnablesNthMFMAInChain final : public InstructionRule { 1017 private: 1018 unsigned Number = 1; 1019 SUnit *ChainSeed; 1020 1021 public: 1022 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1023 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1024 auto DAG = SyncPipe[0].DAG; 1025 1026 if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) 1027 return false; 1028 1029 if (Cache->empty()) { 1030 auto TempSU = ChainSeed; 1031 auto Depth = Number; 1032 while (Depth > 0) { 1033 --Depth; 1034 bool Found = false; 1035 for (auto &Succ : TempSU->Succs) { 1036 if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { 1037 TempSU = Succ.getSUnit(); 1038 Found = true; 1039 break; 1040 } 1041 } 1042 if (!Found) 1043 return false; 1044 } 1045 1046 Cache->push_back(TempSU); 1047 } 1048 // If we failed to find the instruction to be placed into the cache, we 1049 // would have already exited. 1050 assert(!Cache->empty()); 1051 1052 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); 1053 } 1054 1055 EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed, 1056 const SIInstrInfo *TII, unsigned SGID, 1057 bool NeedsCache = false) 1058 : InstructionRule(TII, SGID, NeedsCache), Number(Number), 1059 ChainSeed(ChainSeed) {} 1060 }; 1061 1062 /// Whether or not the instruction has less than \p Size immediate successors. 1063 /// If \p HasIntermediary is true, this tests also whether all successors of 1064 /// the SUnit have less than \p Size successors. 
1065 class LessThanNSuccs final : public InstructionRule { 1066 private: 1067 unsigned Size = 1; 1068 bool HasIntermediary = false; 1069 1070 public: 1071 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1072 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1073 if (!SyncPipe.size()) 1074 return false; 1075 1076 auto SuccSize = std::count_if( 1077 SU->Succs.begin(), SU->Succs.end(), 1078 [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); 1079 if (SuccSize >= Size) 1080 return false; 1081 1082 if (HasIntermediary) { 1083 for (auto Succ : SU->Succs) { 1084 auto SuccSize = std::count_if( 1085 Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), 1086 [](const SDep &SuccSucc) { 1087 return SuccSucc.getKind() == SDep::Data; 1088 }); 1089 if (SuccSize >= Size) 1090 return false; 1091 } 1092 } 1093 1094 return true; 1095 } 1096 LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID, 1097 bool HasIntermediary = false, bool NeedsCache = false) 1098 : InstructionRule(TII, SGID, NeedsCache), Size(Size), 1099 HasIntermediary(HasIntermediary) {} 1100 }; 1101 1102 /// Whether or not the instruction has greater than or equal to \p Size 1103 /// immediate successors. If \p HasIntermediary is true, this tests also 1104 /// whether all successors of the SUnit have greater than or equal to \p Size 1105 /// successors. 1106 class GreaterThanOrEqualToNSuccs final : public InstructionRule { 1107 private: 1108 unsigned Size = 1; 1109 bool HasIntermediary = false; 1110 1111 public: 1112 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1113 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1114 if (!SyncPipe.size()) 1115 return false; 1116 1117 auto SuccSize = std::count_if( 1118 SU->Succs.begin(), SU->Succs.end(), 1119 [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); 1120 if (SuccSize >= Size) 1121 return true; 1122 1123 if (HasIntermediary) { 1124 for (auto Succ : SU->Succs) { 1125 auto SuccSize = std::count_if( 1126 Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), 1127 [](const SDep &SuccSucc) { 1128 return SuccSucc.getKind() == SDep::Data; 1129 }); 1130 if (SuccSize >= Size) 1131 return true; 1132 } 1133 } 1134 1135 return false; 1136 } 1137 GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII, 1138 unsigned SGID, bool HasIntermediary = false, 1139 bool NeedsCache = false) 1140 : InstructionRule(TII, SGID, NeedsCache), Size(Size), 1141 HasIntermediary(HasIntermediary) {} 1142 }; 1143 1144 // Whether or not the instruction is a relevant V_CVT instruction. 1145 class IsCvt final : public InstructionRule { 1146 public: 1147 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1148 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1149 auto Opc = SU->getInstr()->getOpcode(); 1150 return Opc == AMDGPU::V_CVT_F16_F32_e32 || 1151 Opc == AMDGPU::V_CVT_I32_F32_e32; 1152 } 1153 IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1154 : InstructionRule(TII, SGID, NeedsCache) {} 1155 }; 1156 1157 // Whether or not the instruction is FMA_F32. 
  class IsFMA final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
             SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
    }
    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether or not the instruction is a V_ADD_F32 instruction.
  class IsPipeAdd final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
    }
    IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  /// Whether or not the instruction is an immediate RAW successor
  /// of the SchedGroup \p Distance steps before.
  class IsSuccOfPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      for (auto &OtherEle : OtherGroup->Collection) {
        for (auto &Succ : OtherEle->Succs) {
          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
            return true;
        }
      }

      return false;
    }
    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                         unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };

  /// Whether or not the instruction is a transitive successor of any
  /// instruction in the SchedGroup \p Distance steps before.
  class IsReachableFromPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      auto DAG = SyncPipe[0].DAG;

      for (auto &OtherEle : OtherGroup->Collection)
        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
          return true;

      return false;
    }
    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };

  /// Whether or not the instruction occurs at or after the SU with NodeNum
  /// \p Number
  class OccursAtOrAfterNode final : public InstructionRule {
  private:
    unsigned Number = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      return SU->NodeNum >= Number;
    }
    OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
                        bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
  };

  /// Whether or not the SU is exactly the \p Number th MFMA in the chain
  /// starting with \p ChainSeed
  class IsExactMFMA final : public InstructionRule {
  private:
    unsigned Number = 1;
    SUnit *ChainSeed;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
        return false;

      if (Cache->empty()) {
        auto TempSU = ChainSeed;
        auto Depth = Number;
        while (Depth > 0) {
          --Depth;
          bool Found = false;
          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();
              Found = true;
              break;
            }
          }
          if (!Found) {
            return false;
          }
        }
        Cache->push_back(TempSU);
      }
      // If we failed to find the instruction to be placed into the cache, we
      // would have already exited.
      assert(!Cache->empty());

      return (*Cache)[0] == SU;
    }

    IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
          ChainSeed(ChainSeed) {}
  };

  // Whether the instruction occurs after the first TRANS instruction.
  // This implies the instruction cannot be a predecessor of the first TRANS
  // instruction.
  class OccursAfterExp final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      SmallVector<SUnit *, 12> Worklist;
      auto DAG = SyncPipe[0].DAG;
      if (Cache->empty()) {
        for (auto &SU : DAG->SUnits)
          if (TII->isTRANS(SU.getInstr()->getOpcode())) {
            Cache->push_back(&SU);
            break;
          }
        if (Cache->empty())
          return false;
      }

      return SU->NodeNum > (*Cache)[0]->NodeNum;
    }

    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override;

  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = false;
  }
};

unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;

bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
  SmallVector<SUnit *, 10> ExpPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeSUs;
  SmallVector<SUnit *, 10> PackSUs;
  SmallVector<SUnit *, 10> CvtSUs;

  auto isBitPack = [](unsigned Opc) {
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
  };

  auto isCvt = [](unsigned Opc) {
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
  };

  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };

  AddPipeCount = 0;
  for (SUnit &SU : DAG->SUnits) {
    auto Opc = SU.getInstr()->getOpcode();
    if (TII->isTRANS(Opc)) {
      // Avoid counting a potential bonus V_EXP which all the MFMA depend on
      if (SU.Succs.size() >= 7)
        continue;
      for (auto &Succ : SU.Succs) {
        if (Succ.getSUnit()->Succs.size() >= 7)
          continue;
      }
      ExpPipeCands.push_back(&SU);
    }

    if (TII->isMFMAorWMMA(*SU.getInstr()))
      MFMAPipeCands.push_back(&SU);

    if (isBitPack(Opc))
      PackSUs.push_back(&SU);

    if (isCvt(Opc))
      CvtSUs.push_back(&SU);

    if (isAdd(Opc))
      ++AddPipeCount;
  }

  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
    return false;

  TransPipeCount = 0;

  std::optional<SUnit *> TempMFMA;
  std::optional<SUnit *> TempExp;
  // Count the number of EXPs that reach an MFMA
  for (auto &PredSU : ExpPipeCands) {
    for (auto &SuccSU : MFMAPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        if (!TempExp) {
TempExp = PredSU; 1425 TempMFMA = SuccSU; 1426 } 1427 MFMAPipeSUs.push_back(SuccSU); 1428 ++TransPipeCount; 1429 break; 1430 } 1431 } 1432 } 1433 1434 if (!(TempExp && TempMFMA)) 1435 return false; 1436 1437 HasChainBetweenCvt = 1438 std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(), 1439 [&isCvt](SDep &Succ) { 1440 return isCvt(Succ.getSUnit()->getInstr()->getOpcode()); 1441 }) == (*TempExp)->Succs.end(); 1442 1443 // Count the number of MFMAs that are reached by an EXP 1444 for (auto &SuccSU : MFMAPipeCands) { 1445 if (MFMAPipeSUs.size() && 1446 std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(), 1447 [&SuccSU](SUnit *PotentialMatch) { 1448 return PotentialMatch->NodeNum == SuccSU->NodeNum; 1449 }) != MFMAPipeSUs.end()) 1450 continue; 1451 1452 for (auto &PredSU : ExpPipeCands) { 1453 if (DAG->IsReachable(SuccSU, PredSU)) { 1454 MFMAPipeSUs.push_back(SuccSU); 1455 break; 1456 } 1457 } 1458 } 1459 1460 MFMAPipeCount = MFMAPipeSUs.size(); 1461 1462 assert(TempExp && TempMFMA); 1463 assert(MFMAPipeCount > 0); 1464 1465 std::optional<SUnit *> TempCvt; 1466 for (auto &SuccSU : CvtSUs) { 1467 if (DAG->IsReachable(SuccSU, *TempExp)) { 1468 TempCvt = SuccSU; 1469 break; 1470 } 1471 } 1472 1473 HasCvt = false; 1474 if (TempCvt.has_value()) { 1475 for (auto &SuccSU : MFMAPipeSUs) { 1476 if (DAG->IsReachable(SuccSU, *TempCvt)) { 1477 HasCvt = true; 1478 break; 1479 } 1480 } 1481 } 1482 1483 MFMAChains = 0; 1484 for (auto &MFMAPipeSU : MFMAPipeSUs) { 1485 if (is_contained(MFMAChainSeeds, MFMAPipeSU)) 1486 continue; 1487 if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(), 1488 [&TII](SDep &Succ) { 1489 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); 1490 })) { 1491 MFMAChainSeeds.push_back(MFMAPipeSU); 1492 ++MFMAChains; 1493 } 1494 } 1495 1496 if (!MFMAChains) 1497 return false; 1498 1499 for (auto Pred : MFMAChainSeeds[0]->Preds) { 1500 if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) && 1501 Pred.getSUnit()->getInstr()->mayLoad()) 1502 FirstPipeDSR = Pred.getSUnit()->NodeNum; 1503 } 1504 1505 MFMAChainLength = MFMAPipeCount / MFMAChains; 1506 1507 // The number of bit pack operations that depend on a single V_EXP 1508 unsigned PackSuccCount = std::count_if( 1509 PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) { 1510 return DAG->IsReachable(VPack, *TempExp); 1511 }); 1512 1513 // The number of bit pack operations an MFMA depends on 1514 unsigned PackPredCount = 1515 std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), 1516 [&isBitPack](SDep &Pred) { 1517 auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); 1518 return isBitPack(Opc); 1519 }); 1520 1521 auto PackPred = 1522 std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), 1523 [&isBitPack](SDep &Pred) { 1524 auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); 1525 return isBitPack(Opc); 1526 }); 1527 1528 if (PackPred == (*TempMFMA)->Preds.end()) 1529 return false; 1530 1531 MFMAEnablement = 0; 1532 ExpRequirement = 0; 1533 // How many MFMAs depend on a single bit pack operation 1534 MFMAEnablement = 1535 std::count_if(PackPred->getSUnit()->Succs.begin(), 1536 PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) { 1537 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); 1538 }); 1539 1540 // The number of MFMAs that depend on a single V_EXP 1541 MFMAEnablement *= PackSuccCount; 1542 1543 // The number of V_EXPs required to resolve all dependencies for an MFMA 1544 ExpRequirement = 1545 std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(), 1546 
[this, &PackPred](SUnit *ExpBase) { 1547 return DAG->IsReachable(PackPred->getSUnit(), ExpBase); 1548 }); 1549 1550 ExpRequirement *= PackPredCount; 1551 return true; 1552 } 1553 1554 bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, 1555 AMDGPU::SchedulingPhase Phase) { 1556 const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); 1557 const SIInstrInfo *TII = ST.getInstrInfo(); 1558 1559 if (Phase != AMDGPU::SchedulingPhase::PostRA) 1560 MFMAChainSeeds.clear(); 1561 if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII)) 1562 return false; 1563 1564 return true; 1565 } 1566 1567 bool MFMAExpInterleaveOpt::applyIGLPStrategy( 1568 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 1569 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 1570 AMDGPU::SchedulingPhase Phase) { 1571 1572 bool IsSmallKernelType = 1573 MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; 1574 bool IsLargeKernelType = 1575 MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64; 1576 1577 if (!(IsSmallKernelType || IsLargeKernelType)) 1578 return false; 1579 1580 const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); 1581 const SIInstrInfo *TII = ST.getInstrInfo(); 1582 1583 unsigned PipelineSyncID = 0; 1584 SchedGroup *SG = nullptr; 1585 1586 unsigned MFMAChain = 0; 1587 unsigned PositionInChain = 0; 1588 unsigned CurrMFMAForTransPosition = 0; 1589 1590 auto incrementTransPosition = [&MFMAChain, &PositionInChain, 1591 &CurrMFMAForTransPosition]() { 1592 CurrMFMAForTransPosition += MFMAEnablement; 1593 PositionInChain = (CurrMFMAForTransPosition / MFMAChains); 1594 MFMAChain = CurrMFMAForTransPosition % MFMAChains; 1595 }; 1596 1597 auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() { 1598 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; 1599 return (TempMFMAForTrans / MFMAChains); 1600 }; 1601 1602 auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() { 1603 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; 1604 return TempMFMAForTrans % MFMAChains; 1605 }; 1606 1607 unsigned CurrMFMAPosition = 0; 1608 unsigned MFMAChainForMFMA = 0; 1609 unsigned PositionInChainForMFMA = 0; 1610 1611 auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA, 1612 &PositionInChainForMFMA]() { 1613 ++CurrMFMAPosition; 1614 MFMAChainForMFMA = CurrMFMAPosition % MFMAChains; 1615 PositionInChainForMFMA = CurrMFMAPosition / MFMAChains; 1616 }; 1617 1618 bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA; 1619 assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains); 1620 1621 bool UsesFMA = IsSmallKernelType || !IsPostRA; 1622 bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR; 1623 bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA); 1624 bool UsesVALU = IsSmallKernelType; 1625 1626 // PHASE 1: "Prefetch" 1627 if (UsesFMA) { 1628 // First Round FMA 1629 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1630 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); 1631 if (!IsPostRA && MFMAChains) { 1632 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1633 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), 1634 true)); 1635 } else 1636 SG->addRule( 1637 std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); 1638 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1639 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1640 1641 // Second Round FMA 1642 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1643 
SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); 1644 if (!IsPostRA && MFMAChains) { 1645 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1646 getNextTransPositionInChain(), 1647 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); 1648 } else 1649 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, 1650 SG->getSGID(), true)); 1651 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1652 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1653 } 1654 1655 if (UsesDSRead) { 1656 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1657 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); 1658 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII, 1659 SG->getSGID())); 1660 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1661 } 1662 1663 // First Round EXP 1664 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1665 SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII); 1666 if (!IsPostRA && MFMAChains) 1667 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1668 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true)); 1669 else 1670 SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); 1671 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1672 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), 1673 HasChainBetweenCvt)); 1674 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1675 1676 incrementTransPosition(); 1677 1678 // First Round CVT, Third Round FMA, Second Round EXP; interleaved 1679 for (unsigned I = 0; I < ExpRequirement; I++) { 1680 // First Round CVT 1681 if (UsesCvt) { 1682 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1683 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1684 SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID())); 1685 if (HasChainBetweenCvt) 1686 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( 1687 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); 1688 else 1689 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>( 1690 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); 1691 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1692 } 1693 1694 // Third Round FMA 1695 if (UsesFMA) { 1696 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1697 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1698 if (!IsPostRA && MFMAChains) { 1699 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1700 getNextTransPositionInChain(), 1701 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); 1702 } else 1703 SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1, 1704 TII, SG->getSGID(), true)); 1705 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1706 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1707 } 1708 1709 // Second Round EXP 1710 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1711 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); 1712 if (!IsPostRA && MFMAChains) 1713 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1714 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), 1715 true)); 1716 else 1717 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, 1718 SG->getSGID(), true)); 1719 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1720 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), 1721 HasChainBetweenCvt)); 1722 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1723 } 1724 1725 // The "extra" EXP which enables all MFMA 1726 // TODO: UsesExtraExp 1727 SG = 
&SyncedSchedGroups[PipelineSyncID].emplace_back( 1728 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); 1729 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1730 SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>( 1731 8, TII, SG->getSGID(), HasChainBetweenCvt)); 1732 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1733 1734 // PHASE 2: Main Interleave Loop 1735 1736 // The number of MFMAs per iteration 1737 unsigned MFMARatio = 1738 MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1; 1739 // The number of Exps per iteration 1740 unsigned ExpRatio = 1741 MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement; 1742 // The remaining Exps 1743 unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement) 1744 ? TransPipeCount - (2 * ExpRequirement) 1745 : 0; 1746 unsigned ExpLoopCount = RemainingExp / ExpRatio; 1747 // In loop MFMAs 1748 unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2) 1749 ? MFMAPipeCount - (MFMAEnablement * 2) 1750 : 0; 1751 unsigned MFMALoopCount = MFMAInLoop / MFMARatio; 1752 unsigned VALUOps = 1753 AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount; 1754 unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount); 1755 1756 for (unsigned I = 0; I < LoopSize; I++) { 1757 if (!(I * ExpRatio % ExpRequirement)) 1758 incrementTransPosition(); 1759 1760 // Round N MFMA 1761 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1762 SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII); 1763 if (!IsPostRA && MFMAChains) 1764 SG->addRule(std::make_shared<IsExactMFMA>( 1765 PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII, 1766 SG->getSGID(), true)); 1767 else 1768 SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true)); 1769 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1770 incrementMFMAPosition(); 1771 1772 if (UsesVALU) { 1773 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1774 SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII); 1775 SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID())); 1776 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1777 } 1778 1779 if (UsesDSRead && !(I % 4)) { 1780 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1781 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); 1782 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII, 1783 SG->getSGID())); 1784 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1785 } 1786 1787 // CVT, EXP, FMA Interleaving 1788 for (unsigned J = 0; J < ExpRatio; J++) { 1789 auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1); 1790 auto MaxMFMAOffset = 1791 (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio; 1792 1793 // Round N + 1 CVT 1794 if (UsesCvt) { 1795 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1796 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1797 SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID())); 1798 auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1; 1799 auto DSROffset = I / 4 + 1; 1800 auto MaxDSROffset = MaxMFMAOffset / 4; 1801 // TODO: UsesExtraExp 1802 auto ExpOffset = I * ExpRatio + J >= ExpRequirement ?
0 : 1; 1803 auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) + 1804 std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff + 1805 ExpOffset; 1806 if (HasChainBetweenCvt) 1807 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( 1808 CurrentOffset, TII, SG->getSGID())); 1809 else 1810 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII, 1811 SG->getSGID())); 1812 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1813 } 1814 1815 // Round N + 3 FMA 1816 if (UsesFMA) { 1817 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1818 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1819 if (!IsPostRA && MFMAChains) 1820 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1821 getNextTransPositionInChain(), 1822 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), 1823 true)); 1824 else 1825 SG->addRule(std::make_shared<EnablesNthMFMA>( 1826 (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1, 1827 TII, SG->getSGID(), true)); 1828 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1829 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1830 } 1831 1832 // Round N + 2 Exp 1833 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1834 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); 1835 if (!IsPostRA && MFMAChains) 1836 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1837 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), 1838 true)); 1839 else 1840 SG->addRule(std::make_shared<EnablesNthMFMA>( 1841 (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1, 1842 TII, SG->getSGID(), true)); 1843 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1844 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), 1845 HasChainBetweenCvt)); 1846 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1847 } 1848 } 1849 1850 // PHASE 3: Remaining MFMAs 1851 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1852 SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII); 1853 SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true)); 1854 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1855 return true; 1856 } 1857 1858 class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { 1859 private: 1860 // Whether the DS_READ is a predecessor of first four MFMA in region 1861 class EnablesInitialMFMA final : public InstructionRule { 1862 public: 1863 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1864 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1865 if (!SyncPipe.size()) 1866 return false; 1867 int MFMAsFound = 0; 1868 if (!Cache->size()) { 1869 for (auto &Elt : SyncPipe[0].DAG->SUnits) { 1870 if (TII->isMFMAorWMMA(*Elt.getInstr())) { 1871 ++MFMAsFound; 1872 if (MFMAsFound > 4) 1873 break; 1874 Cache->push_back(&Elt); 1875 } 1876 } 1877 } 1878 1879 assert(Cache->size()); 1880 auto DAG = SyncPipe[0].DAG; 1881 for (auto &Elt : *Cache) { 1882 if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU))) 1883 return true; 1884 } 1885 return false; 1886 } 1887 1888 EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, 1889 bool NeedsCache = false) 1890 : InstructionRule(TII, SGID, NeedsCache) {} 1891 }; 1892 1893 // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE 1894 class IsPermForDSW final : public InstructionRule { 1895 public: 1896 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1897 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1898 auto MI = SU->getInstr(); 1899 if (MI->getOpcode() != 
AMDGPU::V_PERM_B32_e64) 1900 return false; 1901 1902 bool FitsInGroup = false; 1903 // Does the VALU have a DS_WRITE successor 1904 if (!Collection.size()) { 1905 for (auto &Succ : SU->Succs) { 1906 SUnit *SuccUnit = Succ.getSUnit(); 1907 if (TII->isDS(*SuccUnit->getInstr()) && 1908 SuccUnit->getInstr()->mayStore()) { 1909 Cache->push_back(SuccUnit); 1910 FitsInGroup = true; 1911 } 1912 } 1913 return FitsInGroup; 1914 } 1915 1916 assert(Cache->size()); 1917 1918 // Does the VALU have a DS_WRITE successor that is the same as other 1919 // VALU already in the group. The V_PERMs will all share 1 DS_W succ 1920 return llvm::any_of(*Cache, [&SU](SUnit *Elt) { 1921 return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) { 1922 return ThisSucc.getSUnit() == Elt; 1923 }); 1924 }); 1925 } 1926 1927 IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1928 : InstructionRule(TII, SGID, NeedsCache) {} 1929 }; 1930 1931 // Whether the SU is a successor of any element in previous SchedGroup 1932 class IsSuccOfPrevGroup final : public InstructionRule { 1933 public: 1934 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1935 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1936 SchedGroup *OtherGroup = nullptr; 1937 for (auto &PipeSG : SyncPipe) { 1938 if ((unsigned)PipeSG.getSGID() == SGID - 1) { 1939 OtherGroup = &PipeSG; 1940 } 1941 } 1942 1943 if (!OtherGroup) 1944 return false; 1945 if (!OtherGroup->Collection.size()) 1946 return true; 1947 1948 // Does the previous VALU have this DS_Write as a successor 1949 return (std::any_of(OtherGroup->Collection.begin(), 1950 OtherGroup->Collection.end(), [&SU](SUnit *Elt) { 1951 return std::any_of(Elt->Succs.begin(), 1952 Elt->Succs.end(), 1953 [&SU](SDep &Succ) { 1954 return Succ.getSUnit() == SU; 1955 }); 1956 })); 1957 } 1958 IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, 1959 bool NeedsCache = false) 1960 : InstructionRule(TII, SGID, NeedsCache) {} 1961 }; 1962 1963 // Whether the combined load width of group is 128 bits 1964 class VMEMSize final : public InstructionRule { 1965 public: 1966 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1967 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1968 auto MI = SU->getInstr(); 1969 if (MI->getOpcode() == TargetOpcode::BUNDLE) 1970 return false; 1971 if (!Collection.size()) 1972 return true; 1973 1974 int NumBits = 0; 1975 1976 auto TRI = TII->getRegisterInfo(); 1977 auto &MRI = MI->getParent()->getParent()->getRegInfo(); 1978 for (auto &Elt : Collection) { 1979 auto Op = Elt->getInstr()->getOperand(0); 1980 auto Size = 1981 TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); 1982 NumBits += Size; 1983 } 1984 1985 if (NumBits < 128) { 1986 assert(TII->isVMEM(*MI) && MI->mayLoad()); 1987 if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( 1988 MRI, MI->getOperand(0))) <= 1989 128) 1990 return true; 1991 } 1992 1993 return false; 1994 } 1995 1996 VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1997 : InstructionRule(TII, SGID, NeedsCache) {} 1998 }; 1999 2000 /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 2001 /// that is \p Distance steps away 2002 class SharesPredWithPrevNthGroup final : public InstructionRule { 2003 private: 2004 unsigned Distance = 1; 2005 2006 public: 2007 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 2008 SmallVectorImpl<SchedGroup> &SyncPipe) override { 2009 SchedGroup *OtherGroup = nullptr; 2010 if (!SyncPipe.size()) 2011 
return false; 2012 2013 if (!Cache->size()) { 2014 2015 for (auto &PipeSG : SyncPipe) { 2016 if ((unsigned)PipeSG.getSGID() == SGID - Distance) { 2017 OtherGroup = &PipeSG; 2018 } 2019 } 2020 2021 if (!OtherGroup) 2022 return false; 2023 if (!OtherGroup->Collection.size()) 2024 return true; 2025 2026 for (auto &OtherEle : OtherGroup->Collection) { 2027 for (auto &Pred : OtherEle->Preds) { 2028 if (Pred.getSUnit()->getInstr()->getOpcode() == 2029 AMDGPU::V_PERM_B32_e64) 2030 Cache->push_back(Pred.getSUnit()); 2031 } 2032 } 2033 2034 // If the other group has no PERM preds, then this group won't share any 2035 if (!Cache->size()) 2036 return false; 2037 } 2038 2039 auto DAG = SyncPipe[0].DAG; 2040 // Does the previous DS_WRITE share a V_PERM predecessor with this 2041 // VMEM_READ 2042 return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) { 2043 return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); 2044 }); 2045 } 2046 SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, 2047 unsigned SGID, bool NeedsCache = false) 2048 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} 2049 }; 2050 2051 public: 2052 bool applyIGLPStrategy( 2053 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 2054 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 2055 AMDGPU::SchedulingPhase Phase) override; 2056 2057 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 2058 AMDGPU::SchedulingPhase Phase) override { 2059 return true; 2060 } 2061 2062 MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 2063 : IGLPStrategy(DAG, TII) { 2064 IsBottomUp = false; 2065 } 2066 }; 2067 2068 static unsigned DSWCount = 0; 2069 static unsigned DSWWithPermCount = 0; 2070 static unsigned DSWWithSharedVMEMCount = 0; 2071 2072 bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( 2073 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 2074 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 2075 AMDGPU::SchedulingPhase Phase) { 2076 unsigned MFMACount = 0; 2077 unsigned DSRCount = 0; 2078 2079 bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial; 2080 2081 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && 2082 DSWWithSharedVMEMCount == 0)) && 2083 "DSWCounters should be zero in pre-RA scheduling!"); 2084 SmallVector<SUnit *, 6> DSWithPerms; 2085 for (auto &SU : DAG->SUnits) { 2086 auto I = SU.getInstr(); 2087 if (TII->isMFMAorWMMA(*I)) 2088 ++MFMACount; 2089 else if (TII->isDS(*I)) { 2090 if (I->mayLoad()) 2091 ++DSRCount; 2092 else if (I->mayStore() && IsInitial) { 2093 ++DSWCount; 2094 for (auto Pred : SU.Preds) { 2095 if (Pred.getSUnit()->getInstr()->getOpcode() == 2096 AMDGPU::V_PERM_B32_e64) { 2097 DSWithPerms.push_back(&SU); 2098 break; 2099 } 2100 } 2101 } 2102 } 2103 } 2104 2105 if (IsInitial) { 2106 DSWWithPermCount = DSWithPerms.size(); 2107 auto I = DSWithPerms.begin(); 2108 auto E = DSWithPerms.end(); 2109 2110 // Get the count of DS_WRITES with V_PERM predecessors which 2111 // have loop carried dependencies (WAR) on the same VMEM_READs. 2112 // We consider partial overlap as a miss -- in other words, 2113 // for a given DS_W, we only consider another DS_W as matching 2114 // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred 2115 // for every V_PERM pred of this DS_W. 
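    // The walk below enacts this: VMEMLookup maps each VMEM_READ reached
    // through a V_PERM predecessor to a DS_WRITE that uses it. A DS_WRITE is
    // paired with the DS_WRITE its VMEM_READs map to, and both are counted,
    // only if none of its VMEM_READs were new to the map and that candidate
    // has not already been counted.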
2116 DenseMap<MachineInstr *, SUnit *> VMEMLookup; 2117 SmallVector<SUnit *, 6> Counted; 2118 for (; I != E; I++) { 2119 SUnit *Cand = nullptr; 2120 bool MissedAny = false; 2121 for (auto &Pred : (*I)->Preds) { 2122 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) 2123 continue; 2124 2125 if (Cand && llvm::is_contained(Counted, Cand)) 2126 break; 2127 2128 for (auto &Succ : Pred.getSUnit()->Succs) { 2129 auto MI = Succ.getSUnit()->getInstr(); 2130 if (!TII->isVMEM(*MI) || !MI->mayLoad()) 2131 continue; 2132 2133 if (MissedAny || !VMEMLookup.size()) { 2134 MissedAny = true; 2135 VMEMLookup[MI] = *I; 2136 continue; 2137 } 2138 2139 if (!VMEMLookup.contains(MI)) { 2140 MissedAny = true; 2141 VMEMLookup[MI] = *I; 2142 continue; 2143 } 2144 2145 Cand = VMEMLookup[MI]; 2146 if (llvm::is_contained(Counted, Cand)) { 2147 MissedAny = true; 2148 break; 2149 } 2150 } 2151 } 2152 if (!MissedAny && Cand) { 2153 DSWWithSharedVMEMCount += 2; 2154 Counted.push_back(Cand); 2155 Counted.push_back(*I); 2156 } 2157 } 2158 } 2159 2160 assert(DSWWithSharedVMEMCount <= DSWWithPermCount); 2161 SchedGroup *SG; 2162 unsigned PipelineSyncID = 0; 2163 // For kernels with V_PERM, there are enough VALU to mix in between MFMAs 2164 if (DSWWithPermCount) { 2165 for (unsigned I = 0; I < MFMACount; I++) { 2166 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2167 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2168 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2169 2170 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2171 SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); 2172 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2173 } 2174 } 2175 2176 PipelineSyncID = 1; 2177 // Phase 1: Break up DS_READ and MFMA clusters. 2178 // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ 2179 // prefetch 2180 2181 // Make ready initial MFMA 2182 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2183 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); 2184 SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); 2185 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2186 2187 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2188 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2189 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2190 2191 // Interleave MFMA with DS_READ prefetch 2192 for (unsigned I = 0; I < DSRCount - 4; ++I) { 2193 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2194 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); 2195 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2196 2197 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2198 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2199 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2200 } 2201 2202 // Phase 2a: Loop carried dependency with V_PERM 2203 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 2204 // depend on. Interleave MFMA to keep XDL unit busy throughout. 
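  // Each iteration below requests: four V_PERMs that feed a common DS_WRITE,
  // that DS_WRITE, and then two batches of VMEM_READs which share a V_PERM
  // predecessor with the DS_WRITE (each batch capped at a combined 128 bits
  // by the VMEMSize rule), with an MFMA after each batch.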
2205 for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { 2206 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2207 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 2208 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 2209 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2210 2211 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2212 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 2213 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); 2214 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2215 2216 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2217 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 2218 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 2219 1, TII, SG->getSGID(), true)); 2220 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 2221 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2222 2223 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2224 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2225 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2226 2227 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2228 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 2229 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 2230 3, TII, SG->getSGID(), true)); 2231 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 2232 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2233 2234 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2235 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2236 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2237 } 2238 2239 // Phase 2b: Loop carried dependency without V_PERM 2240 // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. 2241 // Interleave MFMA to keep XDL unit busy throughout. 2242 for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { 2243 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2244 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 2245 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2246 2247 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2248 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 2249 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 2250 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2251 2252 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2253 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2254 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2255 } 2256 2257 // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are 2258 // ultimately used by two DS_WRITE 2259 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 2260 // depend on. Interleave MFMA to keep XDL unit busy throughout. 
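  // Unlike Phase 2a, each iteration below issues two V_PERM / DS_WRITE pairs
  // (each followed by an MFMA) before the shared VMEM_READ batches, since the
  // VMEM_READs in this phase are ultimately used by two DS_WRITEs.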
2261 2262 for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { 2263 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2264 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 2265 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 2266 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2267 2268 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2269 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 2270 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); 2271 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2272 2273 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2274 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2275 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2276 2277 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2278 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 2279 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 2280 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2281 2282 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2283 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 2284 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); 2285 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2286 2287 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2288 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2289 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2290 2291 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2292 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 2293 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 2294 2, TII, SG->getSGID(), true)); 2295 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 2296 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2297 2298 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2299 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2300 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2301 2302 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2303 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 2304 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 2305 4, TII, SG->getSGID(), true)); 2306 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 2307 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2308 2309 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2310 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2311 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2312 } 2313 2314 return true; 2315 } 2316 2317 static std::unique_ptr<IGLPStrategy> 2318 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, 2319 const SIInstrInfo *TII) { 2320 switch (ID) { 2321 case MFMASmallGemmOptID: 2322 return std::make_unique<MFMASmallGemmOpt>(DAG, TII); 2323 case MFMASmallGemmSingleWaveOptID: 2324 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII); 2325 case MFMAExpInterleave: 2326 return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII); 2327 } 2328 2329 llvm_unreachable("Unknown IGLPStrategyID"); 2330 } 2331 2332 class IGroupLPDAGMutation : public ScheduleDAGMutation { 2333 private: 2334 const SIInstrInfo *TII; 2335 2336 ScheduleDAGMI *DAG; 2337 2338 // Organize lists of SchedGroups by their SyncID. SchedGroups / 2339 // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added 2340 // between them.
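  // (Each SyncID therefore forms its own independent pipeline; for a
  // SCHED_GROUP_BARRIER the SyncID is taken from the instruction's third
  // operand, see initSchedGroupBarrierPipelineStage.)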
DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; 2342 2343 // Used to track instructions that can be mapped to multiple sched groups 2344 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; 2345 2346 // Add DAG edges that enforce SCHED_BARRIER ordering. 2347 void addSchedBarrierEdges(SUnit &SU); 2348 2349 // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should 2350 // not be reordered across the SCHED_BARRIER. This is used for the base 2351 // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that 2352 // SCHED_BARRIER will always block all instructions that can be classified 2353 // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size 2354 // and may only synchronize with some SchedGroups. Returns the inverse of 2355 // Mask. SCHED_BARRIER's mask describes which instruction types should be 2356 // allowed to be scheduled across it. Invert the mask to get the 2357 // SchedGroupMask of instructions that should be barred. 2358 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; 2359 2360 // Create SchedGroups for a SCHED_GROUP_BARRIER. 2361 void initSchedGroupBarrierPipelineStage( 2362 std::vector<SUnit>::reverse_iterator RIter); 2363 2364 bool initIGLPOpt(SUnit &SU); 2365 2366 public: 2367 void apply(ScheduleDAGInstrs *DAGInstrs) override; 2368 2369 // The order in which the PipelineSolver should process the candidate 2370 // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last 2371 // created SchedGroup first, and will consider that as the ultimate 2372 // predecessor group when linking. TOP_DOWN instead links and processes the 2373 // first created SchedGroup first. 2374 bool IsBottomUp = true; 2375 2376 // The scheduling phase this application of IGLP corresponds with.
2377 AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; 2378 2379 IGroupLPDAGMutation() = default; 2380 IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {} 2381 }; 2382 2383 unsigned SchedGroup::NumSchedGroups = 0; 2384 2385 bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { 2386 if (A != B && DAG->canAddEdge(B, A)) { 2387 DAG->addEdge(B, SDep(A, SDep::Artificial)); 2388 return true; 2389 } 2390 return false; 2391 } 2392 2393 bool SchedGroup::canAddMI(const MachineInstr &MI) const { 2394 bool Result = false; 2395 if (MI.isMetaInstruction()) 2396 Result = false; 2397 2398 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && 2399 (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || 2400 TII->isTRANS(MI))) 2401 Result = true; 2402 2403 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && 2404 TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) 2405 Result = true; 2406 2407 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && 2408 TII->isSALU(MI)) 2409 Result = true; 2410 2411 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && 2412 TII->isMFMAorWMMA(MI)) 2413 Result = true; 2414 2415 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && 2416 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 2417 Result = true; 2418 2419 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && 2420 MI.mayLoad() && 2421 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 2422 Result = true; 2423 2424 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && 2425 MI.mayStore() && 2426 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 2427 Result = true; 2428 2429 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && 2430 TII->isDS(MI)) 2431 Result = true; 2432 2433 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && 2434 MI.mayLoad() && TII->isDS(MI)) 2435 Result = true; 2436 2437 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && 2438 MI.mayStore() && TII->isDS(MI)) 2439 Result = true; 2440 2441 else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) && 2442 TII->isTRANS(MI)) 2443 Result = true; 2444 2445 LLVM_DEBUG( 2446 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) 2447 << (Result ? 
" could classify " : " unable to classify ") << MI); 2448 2449 return Result; 2450 } 2451 2452 int SchedGroup::link(SUnit &SU, bool MakePred, 2453 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 2454 int MissedEdges = 0; 2455 for (auto *A : Collection) { 2456 SUnit *B = &SU; 2457 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 2458 continue; 2459 if (MakePred) 2460 std::swap(A, B); 2461 2462 if (DAG->IsReachable(B, A)) 2463 continue; 2464 2465 // tryAddEdge returns false if there is a dependency that makes adding 2466 // the A->B edge impossible, otherwise it returns true; 2467 bool Added = tryAddEdge(A, B); 2468 if (Added) 2469 AddedEdges.push_back(std::pair(A, B)); 2470 else 2471 ++MissedEdges; 2472 } 2473 2474 return MissedEdges; 2475 } 2476 2477 void SchedGroup::link(SUnit &SU, bool MakePred) { 2478 for (auto *A : Collection) { 2479 SUnit *B = &SU; 2480 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 2481 continue; 2482 if (MakePred) 2483 std::swap(A, B); 2484 2485 tryAddEdge(A, B); 2486 } 2487 } 2488 2489 void SchedGroup::link(SUnit &SU, 2490 function_ref<bool(const SUnit *A, const SUnit *B)> P) { 2491 for (auto *A : Collection) { 2492 SUnit *B = &SU; 2493 if (P(A, B)) 2494 std::swap(A, B); 2495 2496 tryAddEdge(A, B); 2497 } 2498 } 2499 2500 void SchedGroup::link(SchedGroup &OtherGroup) { 2501 for (auto *B : OtherGroup.Collection) 2502 link(*B); 2503 } 2504 2505 bool SchedGroup::canAddSU(SUnit &SU) const { 2506 MachineInstr &MI = *SU.getInstr(); 2507 if (MI.getOpcode() != TargetOpcode::BUNDLE) 2508 return canAddMI(MI); 2509 2510 // Special case for bundled MIs. 2511 const MachineBasicBlock *MBB = MI.getParent(); 2512 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; 2513 while (E != MBB->end() && E->isBundledWithPred()) 2514 ++E; 2515 2516 // Return true if all of the bundled MIs can be added to this group. 
2517 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); 2518 } 2519 2520 void SchedGroup::initSchedGroup() { 2521 for (auto &SU : DAG->SUnits) { 2522 if (isFull()) 2523 break; 2524 2525 if (canAddSU(SU)) 2526 add(SU); 2527 } 2528 } 2529 2530 void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 2531 SUnitsToCandidateSGsMap &SyncedInstrs) { 2532 SUnit &InitSU = *RIter; 2533 for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { 2534 auto &SU = *RIter; 2535 if (isFull()) 2536 break; 2537 2538 if (canAddSU(SU)) 2539 SyncedInstrs[&SU].push_back(SGID); 2540 } 2541 2542 add(InitSU); 2543 assert(MaxSize); 2544 (*MaxSize)++; 2545 } 2546 2547 void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { 2548 auto I = DAG->SUnits.rbegin(); 2549 auto E = DAG->SUnits.rend(); 2550 for (; I != E; ++I) { 2551 auto &SU = *I; 2552 if (isFull()) 2553 break; 2554 if (canAddSU(SU)) 2555 SyncedInstrs[&SU].push_back(SGID); 2556 } 2557 } 2558 2559 void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { 2560 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); 2561 if (!TSchedModel || DAGInstrs->SUnits.empty()) 2562 return; 2563 2564 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); 2565 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); 2566 TII = ST.getInstrInfo(); 2567 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); 2568 SyncedSchedGroups.clear(); 2569 SyncedInstrs.clear(); 2570 bool FoundSB = false; 2571 bool FoundIGLP = false; 2572 bool ShouldApplyIGLP = false; 2573 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { 2574 unsigned Opc = R->getInstr()->getOpcode(); 2575 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. 2576 if (Opc == AMDGPU::SCHED_BARRIER) { 2577 addSchedBarrierEdges(*R); 2578 FoundSB = true; 2579 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { 2580 initSchedGroupBarrierPipelineStage(R); 2581 FoundSB = true; 2582 } else if (Opc == AMDGPU::IGLP_OPT) { 2583 resetEdges(*R, DAG); 2584 if (!FoundSB && !FoundIGLP) { 2585 FoundIGLP = true; 2586 ShouldApplyIGLP = initIGLPOpt(*R); 2587 } 2588 } 2589 } 2590 2591 if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) { 2592 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); 2593 // PipelineSolver performs the mutation by adding the edges it 2594 // determined as the best 2595 PS.solve(); 2596 return; 2597 } 2598 } 2599 2600 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { 2601 MachineInstr &MI = *SchedBarrier.getInstr(); 2602 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); 2603 // Remove all existing edges from the SCHED_BARRIER that were added due to the 2604 // instruction having side effects. 2605 resetEdges(SchedBarrier, DAG); 2606 LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: " 2607 << MI.getOperand(0).getImm() << "\n"); 2608 auto InvertedMask = 2609 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); 2610 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); 2611 SG.initSchedGroup(); 2612 2613 // Preserve original instruction ordering relative to the SCHED_BARRIER. 
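  // The NodeNum comparison below makes SUnits that originally preceded the
  // SCHED_BARRIER its predecessors, and SUnits that followed it its
  // successors.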
SG.link( 2615 SchedBarrier, 2616 (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( 2617 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); 2618 } 2619 2620 SchedGroupMask 2621 IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { 2622 // Invert mask and erase bits for types of instructions that are implied to be 2623 // allowed past the SCHED_BARRIER. 2624 SchedGroupMask InvertedMask = ~Mask; 2625 2626 // ALU implies VALU, SALU, MFMA, TRANS. 2627 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) 2628 InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & 2629 ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS; 2630 // VALU, SALU, MFMA, TRANS implies ALU. 2631 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || 2632 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || 2633 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE || 2634 (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE) 2635 InvertedMask &= ~SchedGroupMask::ALU; 2636 2637 // VMEM implies VMEM_READ, VMEM_WRITE. 2638 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) 2639 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; 2640 // VMEM_READ, VMEM_WRITE implies VMEM. 2641 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || 2642 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) 2643 InvertedMask &= ~SchedGroupMask::VMEM; 2644 2645 // DS implies DS_READ, DS_WRITE. 2646 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) 2647 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; 2648 // DS_READ, DS_WRITE implies DS. 2649 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || 2650 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) 2651 InvertedMask &= ~SchedGroupMask::DS; 2652 2653 LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask 2654 << "\n"); 2655 2656 return InvertedMask; 2657 } 2658 2659 void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( 2660 std::vector<SUnit>::reverse_iterator RIter) { 2661 // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due 2662 // to the instruction having side effects. 2663 resetEdges(*RIter, DAG); 2664 MachineInstr &SGB = *RIter->getInstr(); 2665 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); 2666 int32_t SGMask = SGB.getOperand(0).getImm(); 2667 int32_t Size = SGB.getOperand(1).getImm(); 2668 int32_t SyncID = SGB.getOperand(2).getImm(); 2669 2670 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, 2671 Size, SyncID, DAG, TII); 2672 2673 SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); 2674 } 2675 2676 bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { 2677 IGLPStrategyID StrategyID = 2678 (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); 2679 auto S = createIGLPStrategy(StrategyID, DAG, TII); 2680 if (!S->shouldApplyStrategy(DAG, Phase)) 2681 return false; 2682 2683 IsBottomUp = S->IsBottomUp; 2684 return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase); 2685 } 2686 2687 } // namespace 2688 2689 namespace llvm { 2690 2691 /// \p Phase specifies whether or not this is a reentry into the 2692 /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the 2693 /// same scheduling region (e.g.
pre and post-RA scheduling / multiple 2694 /// scheduling "phases"), we can reenter this mutation framework more than once 2695 /// for a given region. 2696 std::unique_ptr<ScheduleDAGMutation> 2697 createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { 2698 return std::make_unique<IGroupLPDAGMutation>(Phase); 2699 } 2700 2701 } // end namespace llvm 2702