1 //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file This file defines a set of schedule DAG mutations that can be used to
10 // override default scheduler behavior to enforce specific scheduling patterns.
11 // They should be used in cases where runtime performance considerations, such as
12 // inter-wavefront interactions, mean that compile-time heuristics cannot
13 // predict the optimal instruction ordering, or in kernels where optimum
14 // instruction scheduling is important enough to warrant manual intervention.
15 //
16 //===----------------------------------------------------------------------===//
17
18 #include "AMDGPUIGroupLP.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/BitmaskEnum.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27
28 using namespace llvm;
29
30 #define DEBUG_TYPE "igrouplp"
31
32 namespace {
33
34 static cl::opt<bool> EnableExactSolver(
35 "amdgpu-igrouplp-exact-solver", cl::Hidden,
36 cl::desc("Whether to use the exponential time solver to fit "
37 "the instructions to the pipeline as closely as "
38 "possible."),
39 cl::init(false));
40
41 static cl::opt<unsigned> CutoffForExact(
42 "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43 cl::desc("The maximum number of scheduling group conflicts "
44 "which we attempt to solve with the exponential time "
45 "exact solver. Problem sizes greater than this will "
46 "be solved by the less accurate greedy algorithm. Selecting "
47 "solver by size is superseded by manually selecting "
48 "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));
49
50 static cl::opt<uint64_t> MaxBranchesExplored(
51 "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52 cl::desc("The number of branches that we are willing to explore with "
53 "the exact algorithm before giving up."));
54
55 static cl::opt<bool> UseCostHeur(
56 "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57 cl::desc("Whether to use the cost heuristic to make choices as we "
58 "traverse the search space using the exact solver. Defaulted "
59 "to on, and if turned off, we will use the node order -- "
60 "attempting to put the later nodes in the later sched groups. "
61 "Experimentally, results are mixed, so this should be set on a "
62 "case-by-case basis."));
63
64 // Components of the mask that determines which instruction types may be
65 // classified into a SchedGroup.
66 enum class SchedGroupMask {
67 NONE = 0u,
68 ALU = 1u << 0,
69 VALU = 1u << 1,
70 SALU = 1u << 2,
71 MFMA = 1u << 3,
72 VMEM = 1u << 4,
73 VMEM_READ = 1u << 5,
74 VMEM_WRITE = 1u << 6,
75 DS = 1u << 7,
76 DS_READ = 1u << 8,
77 DS_WRITE = 1u << 9,
78 ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
79 DS_READ | DS_WRITE,
80 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
81 };
82
83 class SchedGroup;
84
85 // The InstructionRule class is used to enact a filter which determines whether
86 // or not an SU maps to a given SchedGroup.
It contains complementary data 87 // structures (e.g Cache) to help those filters. 88 class InstructionRule { 89 protected: 90 const SIInstrInfo *TII; 91 unsigned SGID; 92 // A cache made available to the Filter to store SUnits for subsequent 93 // invocations of the Filter 94 std::optional<SmallVector<SUnit *, 4>> Cache; 95 96 public: 97 virtual bool 98 apply(const SUnit *, const ArrayRef<SUnit *>, 99 SmallVectorImpl<SchedGroup> &) { 100 return true; 101 }; 102 103 InstructionRule(const SIInstrInfo *TII, unsigned SGID, 104 bool NeedsCache = false) 105 : TII(TII), SGID(SGID) { 106 if (NeedsCache) { 107 Cache = SmallVector<SUnit *, 4>(); 108 } 109 } 110 111 virtual ~InstructionRule() = default; 112 }; 113 114 typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap; 115 116 // Classify instructions into groups to enable fine tuned control over the 117 // scheduler. These groups may be more specific than current SchedModel 118 // instruction classes. 119 class SchedGroup { 120 private: 121 // Mask that defines which instruction types can be classified into this 122 // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER 123 // and SCHED_GROUP_BARRIER. 124 SchedGroupMask SGMask; 125 126 // Maximum number of SUnits that can be added to this group. 127 std::optional<unsigned> MaxSize; 128 129 // SchedGroups will only synchronize with other SchedGroups that have the same 130 // SyncID. 131 int SyncID = 0; 132 133 // SGID is used to map instructions to candidate SchedGroups 134 unsigned SGID; 135 136 // The different rules each instruction in this SchedGroup must conform to 137 SmallVector<std::shared_ptr<InstructionRule>, 4> Rules; 138 139 // Count of the number of created SchedGroups, used to initialize SGID. 140 static unsigned NumSchedGroups; 141 142 const SIInstrInfo *TII; 143 144 // Try to add and edge from SU A to SU B. 145 bool tryAddEdge(SUnit *A, SUnit *B); 146 147 // Use SGMask to determine whether we can classify MI as a member of this 148 // SchedGroup object. 149 bool canAddMI(const MachineInstr &MI) const; 150 151 public: 152 // Collection of SUnits that are classified as members of this group. 153 SmallVector<SUnit *, 32> Collection; 154 155 ScheduleDAGInstrs *DAG; 156 157 // Returns true if SU can be added to this SchedGroup. 158 bool canAddSU(SUnit &SU) const; 159 160 // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If 161 // MakePred is true, SU will be a predecessor of the SUnits in this 162 // SchedGroup, otherwise SU will be a successor. 163 void link(SUnit &SU, bool MakePred = false); 164 165 // Add DAG dependencies and track which edges are added, and the count of 166 // missed edges 167 int link(SUnit &SU, bool MakePred, 168 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 169 170 // Add DAG dependencies from all SUnits in this SchedGroup and this SU. 171 // Use the predicate to determine whether SU should be a predecessor (P = 172 // true) or a successor (P = false) of this SchedGroup. 173 void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P); 174 175 // Add DAG dependencies such that SUnits in this group shall be ordered 176 // before SUnits in OtherGroup. 177 void link(SchedGroup &OtherGroup); 178 179 // Returns true if no more instructions may be added to this group. 180 bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; } 181 182 // Append a constraint that SUs must meet in order to fit into this 183 // SchedGroup. 
Since many rules involve the relationship between a SchedGroup 184 // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve 185 // time (rather than SchedGroup init time.) 186 void addRule(std::shared_ptr<InstructionRule> NewRule) { 187 Rules.push_back(NewRule); 188 } 189 190 // Returns true if the SU matches all rules 191 bool allowedByRules(const SUnit *SU, 192 SmallVectorImpl<SchedGroup> &SyncPipe) const { 193 if (Rules.empty()) 194 return true; 195 for (size_t I = 0; I < Rules.size(); I++) { 196 auto TheRule = Rules[I].get(); 197 if (!TheRule->apply(SU, Collection, SyncPipe)) { 198 return false; 199 } 200 } 201 return true; 202 } 203 204 // Add SU to the SchedGroup. 205 void add(SUnit &SU) { 206 LLVM_DEBUG(dbgs() << "For SchedGroup with mask " 207 << format_hex((int)SGMask, 10, true) << " adding " 208 << *SU.getInstr()); 209 Collection.push_back(&SU); 210 } 211 212 // Remove last element in the SchedGroup 213 void pop() { Collection.pop_back(); } 214 215 // Identify and add all relevant SUs from the DAG to this SchedGroup. 216 void initSchedGroup(); 217 218 // Add instructions to the SchedGroup bottom up starting from RIter. 219 // PipelineInstrs is a set of instructions that should not be added to the 220 // SchedGroup even when the other conditions for adding it are satisfied. 221 // RIter will be added to the SchedGroup as well, and dependencies will be 222 // added so that RIter will always be scheduled at the end of the group. 223 void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 224 SUnitsToCandidateSGsMap &SyncedInstrs); 225 226 void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs); 227 228 int getSyncID() { return SyncID; } 229 230 int getSGID() { return SGID; } 231 232 SchedGroupMask getMask() { return SGMask; } 233 234 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, 235 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 236 : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) { 237 SGID = NumSchedGroups++; 238 } 239 240 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID, 241 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 242 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) { 243 SGID = NumSchedGroups++; 244 } 245 }; 246 247 // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. 248 static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { 249 assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || 250 SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || 251 SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT); 252 253 while (!SU.Preds.empty()) 254 for (auto &P : SU.Preds) 255 SU.removePred(P); 256 257 while (!SU.Succs.empty()) 258 for (auto &S : SU.Succs) 259 for (auto &SP : S.getSUnit()->Preds) 260 if (SP.getSUnit() == &SU) 261 S.getSUnit()->removePred(SP); 262 } 263 264 typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair; 265 typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec; 266 267 // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline 268 // in non-trivial cases. For example, if the requested pipeline is 269 // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction 270 // in the DAG, then we will have an instruction that can not be trivially 271 // assigned to a SchedGroup. The PipelineSolver class implements two algorithms 272 // to find a good solution to the pipeline -- a greedy algorithm and an exact 273 // algorithm. 
The exact algorithm has an exponential time complexity and should
274 // only be used for small-sized problems, or medium-sized problems where an exact
275 // solution is highly desired.
276 class PipelineSolver {
277 ScheduleDAGMI *DAG;
278
279 // Instructions that can be assigned to multiple SchedGroups
280 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
281 SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
282 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
283 // The current working pipeline
284 SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
285 // The pipeline that has the best solution found so far
286 SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
287
288 // Whether or not we actually have any SyncedInstrs to try to solve.
289 bool NeedsSolver = false;
290
291 // Compute an estimate of the size of the search tree -- the true size is
292 // the product of each conflictedInst.Matches.size() across all SyncPipelines
293 unsigned computeProblemSize();
294
295 // The cost penalty of not assigning a SU to a SchedGroup
296 int MissPenalty = 0;
297
298 // Costs in terms of the number of edges we are unable to add
299 int BestCost = -1;
300 int CurrCost = 0;
301
302 // Index pointing to the conflicting instruction that is currently being
303 // fitted
304 int CurrConflInstNo = 0;
305 // Index to the pipeline that is currently being fitted
306 int CurrSyncGroupIdx = 0;
307 // The first non-trivial pipeline
308 int BeginSyncGroupIdx = 0;
309
310 // How many branches we have explored
311 uint64_t BranchesExplored = 0;
312
313 // The direction in which we process the candidate SchedGroups per SU
314 bool IsBottomUp = 1;
315
316 // Update indices to fit next conflicting instruction
317 void advancePosition();
318 // Recede indices to attempt to find better fit for previous conflicting
319 // instruction
320 void retreatPosition();
321
322 // The exponential time algorithm which finds the provably best fit
323 bool solveExact();
324 // The polynomial time algorithm which attempts to find a good fit
325 bool solveGreedy();
326 // Find the best SchedGroup for the current SU using the heuristic given all
327 // current information. One step in the greedy algorithm. Templated against
328 // the SchedGroup iterator (either reverse or forward).
329 template <typename T>
330 void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
331 T E);
332 // Whether or not the current solution is optimal
333 bool checkOptimal();
334 // Populate the ready list, prioritizing fewest missed edges first
335 // Templated against the SchedGroup iterator (either reverse or forward).
336 template <typename T>
337 void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
338 T E);
339 // Add edges corresponding to the SchedGroups as assigned by solver
340 void makePipeline();
341 // Link the SchedGroups in the best found pipeline.
342 // Templated against the SchedGroup iterator (either reverse or forward).
343 template <typename T> void linkSchedGroups(T I, T E);
344 // Add the edges from the SU to the other SchedGroups in the pipeline, and
345 // return the number of edges missed.
346 int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
347 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
348 // Link the pipeline as if \p SU were in the SchedGroup with ID \p SGID.
It 349 // returns the cost (in terms of missed pipeline edges), and tracks the edges 350 // added in \p AddedEdges 351 template <typename T> 352 int linkSUnit(SUnit *SU, int SGID, 353 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); 354 // Remove the edges passed via \p AddedEdges 355 void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 356 // Convert the passed in maps to arrays for bidirectional iterators 357 void convertSyncMapsToArrays(); 358 359 void reset(); 360 361 public: 362 // Invoke the solver to map instructions to instruction groups. Heuristic && 363 // command-line-option determines to use exact or greedy algorithm. 364 void solve(); 365 366 PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 367 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 368 ScheduleDAGMI *DAG, bool IsBottomUp = 1) 369 : DAG(DAG), SyncedInstrs(SyncedInstrs), 370 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { 371 372 for (auto &PipelineInstrs : SyncedInstrs) { 373 if (PipelineInstrs.second.size() > 0) { 374 NeedsSolver = true; 375 break; 376 } 377 } 378 379 if (!NeedsSolver) 380 return; 381 382 convertSyncMapsToArrays(); 383 384 CurrPipeline = BestPipeline; 385 386 while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() && 387 PipelineInstrs[BeginSyncGroupIdx].size() == 0) 388 ++BeginSyncGroupIdx; 389 390 if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size()) 391 return; 392 } 393 }; 394 395 void PipelineSolver::reset() { 396 397 for (auto &SyncPipeline : CurrPipeline) { 398 for (auto &SG : SyncPipeline) { 399 SmallVector<SUnit *, 32> TempCollection = SG.Collection; 400 SG.Collection.clear(); 401 auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { 402 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; 403 }); 404 if (SchedBarr != TempCollection.end()) 405 SG.Collection.push_back(*SchedBarr); 406 } 407 } 408 409 CurrSyncGroupIdx = BeginSyncGroupIdx; 410 CurrConflInstNo = 0; 411 CurrCost = 0; 412 } 413 414 void PipelineSolver::convertSyncMapsToArrays() { 415 for (auto &SyncPipe : SyncedSchedGroups) { 416 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second); 417 } 418 419 int PipelineIDx = SyncedInstrs.size() - 1; 420 PipelineInstrs.resize(SyncedInstrs.size()); 421 for (auto &SyncInstrMap : SyncedInstrs) { 422 for (auto &SUsToCandSGs : SyncInstrMap.second) { 423 if (PipelineInstrs[PipelineIDx].size() == 0) { 424 PipelineInstrs[PipelineIDx].push_back( 425 std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 426 continue; 427 } 428 auto SortPosition = PipelineInstrs[PipelineIDx].begin(); 429 // Insert them in sorted order -- this allows for good parsing order in 430 // the greedy algorithm 431 while (SortPosition != PipelineInstrs[PipelineIDx].end() && 432 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum) 433 ++SortPosition; 434 PipelineInstrs[PipelineIDx].insert( 435 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 436 } 437 --PipelineIDx; 438 } 439 } 440 441 template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { 442 for (; I != E; ++I) { 443 auto &GroupA = *I; 444 for (auto J = std::next(I); J != E; ++J) { 445 auto &GroupB = *J; 446 GroupA.link(GroupB); 447 } 448 } 449 } 450 451 void PipelineSolver::makePipeline() { 452 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations 453 for (auto &SyncPipeline : BestPipeline) { 454 LLVM_DEBUG(dbgs() << "Printing SchedGroups\n"); 455 for (auto &SG : 
SyncPipeline) { 456 LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() 457 << " has: \n"); 458 SUnit *SGBarr = nullptr; 459 for (auto &SU : SG.Collection) { 460 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 461 SGBarr = SU; 462 LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); 463 } 464 // Command line requested IGroupLP doesn't have SGBarr 465 if (!SGBarr) 466 continue; 467 resetEdges(*SGBarr, DAG); 468 SG.link(*SGBarr, false); 469 } 470 } 471 472 for (auto &SyncPipeline : BestPipeline) { 473 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) 474 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); 475 } 476 } 477 478 template <typename T> 479 int PipelineSolver::linkSUnit( 480 SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, 481 T I, T E) { 482 bool MakePred = false; 483 int AddedCost = 0; 484 for (; I < E; ++I) { 485 if (I->getSGID() == SGID) { 486 MakePred = true; 487 continue; 488 } 489 auto Group = *I; 490 AddedCost += Group.link(*SU, MakePred, AddedEdges); 491 assert(AddedCost >= 0); 492 } 493 return AddedCost; 494 } 495 496 int PipelineSolver::addEdges( 497 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, 498 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 499 500 // For IsBottomUp, the first SchedGroup in SyncPipeline contains the 501 // instructions that are the ultimate successors in the resultant mutation. 502 // Therefore, in such a configuration, the SchedGroups occurring before the 503 // candidate SGID are successors of the candidate SchedGroup, thus the current 504 // SU should be linked as a predecessor to SUs in those SchedGroups. The 505 // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple 506 // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using 507 // IsBottomUp (in reverse). 508 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), 509 SyncPipeline.rend()) 510 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), 511 SyncPipeline.end()); 512 } 513 514 void PipelineSolver::removeEdges( 515 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { 516 // Only remove the edges that we have added when testing 517 // the fit. 
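// Every edge recorded here was created by tryAddEdge as an artificial SDep,
// so undoing a trial assignment only needs to strip the matching artificial
// predecessor from each recorded (Pred, Succ) pair.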
518 for (auto &PredSuccPair : EdgesToRemove) { 519 SUnit *Pred = PredSuccPair.first; 520 SUnit *Succ = PredSuccPair.second; 521 522 auto Match = llvm::find_if( 523 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); 524 if (Match != Succ->Preds.end()) { 525 assert(Match->isArtificial()); 526 Succ->removePred(*Match); 527 } 528 } 529 } 530 531 void PipelineSolver::advancePosition() { 532 ++CurrConflInstNo; 533 534 if (static_cast<size_t>(CurrConflInstNo) >= 535 PipelineInstrs[CurrSyncGroupIdx].size()) { 536 CurrConflInstNo = 0; 537 ++CurrSyncGroupIdx; 538 // Advance to next non-trivial pipeline 539 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && 540 PipelineInstrs[CurrSyncGroupIdx].size() == 0) 541 ++CurrSyncGroupIdx; 542 } 543 } 544 545 void PipelineSolver::retreatPosition() { 546 assert(CurrConflInstNo >= 0); 547 assert(CurrSyncGroupIdx >= 0); 548 549 if (CurrConflInstNo > 0) { 550 --CurrConflInstNo; 551 return; 552 } 553 554 if (CurrConflInstNo == 0) { 555 // If we return to the starting position, we have explored 556 // the entire tree 557 if (CurrSyncGroupIdx == BeginSyncGroupIdx) 558 return; 559 560 --CurrSyncGroupIdx; 561 // Go to previous non-trivial pipeline 562 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) 563 --CurrSyncGroupIdx; 564 565 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; 566 } 567 } 568 569 bool PipelineSolver::checkOptimal() { 570 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { 571 if (BestCost == -1 || CurrCost < BestCost) { 572 BestPipeline = CurrPipeline; 573 BestCost = CurrCost; 574 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n"); 575 } 576 assert(BestCost >= 0); 577 } 578 579 bool DoneExploring = false; 580 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) 581 DoneExploring = true; 582 583 return (DoneExploring || BestCost == 0); 584 } 585 586 template <typename T> 587 void PipelineSolver::populateReadyList( 588 SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { 589 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 590 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 591 assert(CurrSU.second.size() >= 1); 592 593 for (; I != E; ++I) { 594 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 595 int CandSGID = *I; 596 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 597 return SG.getSGID() == CandSGID; 598 }); 599 assert(Match); 600 601 if (UseCostHeur) { 602 if (Match->isFull()) { 603 ReadyList.push_back(std::pair(*I, MissPenalty)); 604 continue; 605 } 606 607 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 608 ReadyList.push_back(std::pair(*I, TempCost)); 609 removeEdges(AddedEdges); 610 } else 611 ReadyList.push_back(std::pair(*I, -1)); 612 } 613 614 if (UseCostHeur) { 615 std::sort(ReadyList.begin(), ReadyList.end(), 616 [](std::pair<int, int> A, std::pair<int, int> B) { 617 return A.second < B.second; 618 }); 619 } 620 621 assert(ReadyList.size() == CurrSU.second.size()); 622 } 623 624 bool PipelineSolver::solveExact() { 625 if (checkOptimal()) 626 return true; 627 628 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) 629 return false; 630 631 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); 632 assert(static_cast<size_t>(CurrConflInstNo) < 633 PipelineInstrs[CurrSyncGroupIdx].size()); 634 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 635 LLVM_DEBUG(dbgs() << "Fitting SU(" << 
CurrSU.first->NodeNum 636 << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 637 638 // SchedGroup -> Cost pairs 639 SmallVector<std::pair<int, int>, 4> ReadyList; 640 // Prioritize the candidate sched groups in terms of lowest cost first 641 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(), 642 CurrSU.second.rend()) 643 : populateReadyList(ReadyList, CurrSU.second.begin(), 644 CurrSU.second.end()); 645 646 auto I = ReadyList.begin(); 647 auto E = ReadyList.end(); 648 for (; I != E; ++I) { 649 // If we are trying SGs in least cost order, and the current SG is cost 650 // infeasible, then all subsequent SGs will also be cost infeasible, so we 651 // can prune. 652 if (BestCost != -1 && (CurrCost + I->second > BestCost)) 653 return false; 654 655 int CandSGID = I->first; 656 int AddedCost = 0; 657 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 658 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 659 SchedGroup *Match; 660 for (auto &SG : SyncPipeline) { 661 if (SG.getSGID() == CandSGID) 662 Match = &SG; 663 } 664 665 if (Match->isFull()) 666 continue; 667 668 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) 669 continue; 670 671 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " 672 << (int)Match->getMask() << "and ID " << CandSGID 673 << "\n"); 674 Match->add(*CurrSU.first); 675 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 676 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n"); 677 CurrCost += AddedCost; 678 advancePosition(); 679 ++BranchesExplored; 680 bool FinishedExploring = false; 681 // If the Cost after adding edges is greater than a known solution, 682 // backtrack 683 if (CurrCost < BestCost || BestCost == -1) { 684 if (solveExact()) { 685 FinishedExploring = BestCost != 0; 686 if (!FinishedExploring) 687 return true; 688 } 689 } 690 691 retreatPosition(); 692 CurrCost -= AddedCost; 693 removeEdges(AddedEdges); 694 Match->pop(); 695 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 696 if (FinishedExploring) 697 return true; 698 } 699 700 // Try the pipeline where the current instruction is omitted 701 // Potentially if we omit a problematic instruction from the pipeline, 702 // all the other instructions can nicely fit. 703 CurrCost += MissPenalty; 704 advancePosition(); 705 706 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n"); 707 708 bool FinishedExploring = false; 709 if (CurrCost < BestCost || BestCost == -1) { 710 if (solveExact()) { 711 bool FinishedExploring = BestCost != 0; 712 if (!FinishedExploring) 713 return true; 714 } 715 } 716 717 retreatPosition(); 718 CurrCost -= MissPenalty; 719 return FinishedExploring; 720 } 721 722 template <typename T> 723 void PipelineSolver::greedyFind( 724 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { 725 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 726 int BestNodeCost = -1; 727 int TempCost; 728 SchedGroup *BestGroup = nullptr; 729 int BestGroupID = -1; 730 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 731 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 732 << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 733 734 // Since we have added the potential SchedGroups from bottom up, but 735 // traversed the DAG from top down, parse over the groups from last to 736 // first. If we fail to do this for the greedy algorithm, the solution will 737 // likely not be good in more complex cases. 
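// Illustrative sketch (hypothetical pipeline): with repeated {DS(2), MFMA(1)}
// groups, an SU that matches several DS candidates is tried against the
// later-created (higher SGID) groups first; greedily packing it into the
// earliest group tends to strand later groups and accrue missed edges.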
738 for (; I != E; ++I) { 739 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 740 int CandSGID = *I; 741 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 742 return SG.getSGID() == CandSGID; 743 }); 744 assert(Match); 745 746 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " 747 << (int)Match->getMask() << "\n"); 748 749 if (Match->isFull()) { 750 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); 751 continue; 752 } 753 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { 754 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n"); 755 continue; 756 } 757 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 758 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); 759 if (TempCost < BestNodeCost || BestNodeCost == -1) { 760 BestGroup = Match; 761 BestNodeCost = TempCost; 762 BestGroupID = CandSGID; 763 } 764 removeEdges(AddedEdges); 765 if (BestNodeCost == 0) 766 break; 767 } 768 769 if (BestGroupID != -1) { 770 BestGroup->add(*CurrSU.first); 771 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); 772 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" 773 << (int)BestGroup->getMask() << "\n"); 774 BestCost += TempCost; 775 } else 776 BestCost += MissPenalty; 777 778 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 779 } 780 781 bool PipelineSolver::solveGreedy() { 782 BestCost = 0; 783 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 784 785 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { 786 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 787 IsBottomUp 788 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) 789 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); 790 advancePosition(); 791 } 792 BestPipeline = CurrPipeline; 793 removeEdges(AddedEdges); 794 return false; 795 } 796 797 unsigned PipelineSolver::computeProblemSize() { 798 unsigned ProblemSize = 0; 799 for (auto &PipeConflicts : PipelineInstrs) { 800 ProblemSize += PipeConflicts.size(); 801 } 802 803 return ProblemSize; 804 } 805 806 void PipelineSolver::solve() { 807 if (!NeedsSolver) 808 return; 809 810 unsigned ProblemSize = computeProblemSize(); 811 assert(ProblemSize > 0); 812 813 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; 814 MissPenalty = (ProblemSize / 2) + 1; 815 816 LLVM_DEBUG(DAG->dump()); 817 if (EnableExactSolver || BelowCutoff) { 818 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n"); 819 solveGreedy(); 820 reset(); 821 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n"); 822 if (BestCost > 0) { 823 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n"); 824 solveExact(); 825 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n"); 826 } 827 } else { // Use the Greedy Algorithm by default 828 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n"); 829 solveGreedy(); 830 } 831 832 makePipeline(); 833 LLVM_DEBUG(dbgs() << "After applying mutation\n"); 834 LLVM_DEBUG(DAG->dump()); 835 } 836 837 enum IGLPStrategyID : int { 838 MFMASmallGemmOptID = 0, 839 MFMASmallGemmSingleWaveOptID = 1, 840 }; 841 842 // Implement a IGLP scheduling strategy. 843 class IGLPStrategy { 844 protected: 845 ScheduleDAGInstrs *DAG; 846 847 const SIInstrInfo *TII; 848 849 public: 850 // Add SchedGroups to \p Pipeline to implement this Strategy. 
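// (For reference: a kernel selects one of these strategies with the
// llvm.amdgcn.iglp.opt intrinsic, whose immediate operand is interpreted as an
// IGLPStrategyID by initIGLPOpt() below. As an illustrative IR sketch,
//   call void @llvm.amdgcn.iglp.opt(i32 1)
// requests MFMASmallGemmSingleWaveOpt.)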
851 virtual void applyIGLPStrategy( 852 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 853 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0; 854 855 // Returns true if this strategy should be applied to a ScheduleDAG. 856 virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; 857 858 bool IsBottomUp = 1; 859 860 IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 861 : DAG(DAG), TII(TII) {} 862 863 virtual ~IGLPStrategy() = default; 864 }; 865 866 class MFMASmallGemmOpt final : public IGLPStrategy { 867 private: 868 public: 869 void applyIGLPStrategy( 870 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 871 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override; 872 873 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } 874 875 MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 876 : IGLPStrategy(DAG, TII) { 877 IsBottomUp = 1; 878 } 879 }; 880 881 void MFMASmallGemmOpt::applyIGLPStrategy( 882 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 883 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) { 884 // Count the number of MFMA instructions. 885 unsigned MFMACount = 0; 886 for (const MachineInstr &I : *DAG) 887 if (TII->isMFMAorWMMA(I)) 888 ++MFMACount; 889 890 const unsigned PipelineSyncID = 0; 891 SchedGroup *SG = nullptr; 892 for (unsigned I = 0; I < MFMACount * 3; ++I) { 893 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 894 SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); 895 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 896 897 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 898 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 899 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 900 } 901 } 902 903 class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { 904 private: 905 // Whether the DS_READ is a predecessor of first four MFMA in region 906 class EnablesInitialMFMA final : public InstructionRule { 907 public: 908 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 909 SmallVectorImpl<SchedGroup> &SyncPipe) override { 910 if (!SyncPipe.size()) 911 return false; 912 int MFMAsFound = 0; 913 if (!Cache->size()) { 914 for (auto &Elt : SyncPipe[0].DAG->SUnits) { 915 if (TII->isMFMAorWMMA(*Elt.getInstr())) { 916 ++MFMAsFound; 917 if (MFMAsFound > 4) 918 break; 919 Cache->push_back(&Elt); 920 } 921 } 922 } 923 924 assert(Cache->size()); 925 auto DAG = SyncPipe[0].DAG; 926 for (auto &Elt : *Cache) { 927 if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU))) 928 return true; 929 } 930 return false; 931 } 932 933 EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, 934 bool NeedsCache = false) 935 : InstructionRule(TII, SGID, NeedsCache) {} 936 }; 937 938 // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE 939 class IsPermForDSW final : public InstructionRule { 940 public: 941 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 942 SmallVectorImpl<SchedGroup> &SyncPipe) override { 943 auto MI = SU->getInstr(); 944 if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) 945 return false; 946 947 bool FitsInGroup = false; 948 // Does the VALU have a DS_WRITE successor 949 if (!Collection.size()) { 950 for (auto &Succ : SU->Succs) { 951 SUnit *SuccUnit = Succ.getSUnit(); 952 if (TII->isDS(*SuccUnit->getInstr()) && 953 SuccUnit->getInstr()->mayStore()) { 954 Cache->push_back(SuccUnit); 955 FitsInGroup = true; 956 } 957 } 958 return FitsInGroup; 959 } 960 961 assert(Cache->size()); 962 963 // Does the VALU have a 
DS_WRITE successor that is the same as other 964 // VALU already in the group. The V_PERMs will all share 1 DS_W succ 965 return std::any_of(Cache->begin(), Cache->end(), [&SU](SUnit *Elt) { 966 return std::any_of(SU->Succs.begin(), SU->Succs.end(), 967 [&Elt](const SDep &ThisSucc) { 968 return ThisSucc.getSUnit() == Elt; 969 }); 970 }); 971 } 972 973 IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 974 : InstructionRule(TII, SGID, NeedsCache) {} 975 }; 976 977 // Whether the SU is a successor of any element in previous SchedGroup 978 class IsSuccOfPrevGroup final : public InstructionRule { 979 public: 980 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 981 SmallVectorImpl<SchedGroup> &SyncPipe) override { 982 SchedGroup *OtherGroup = nullptr; 983 for (auto &PipeSG : SyncPipe) { 984 if ((unsigned)PipeSG.getSGID() == SGID - 1) { 985 OtherGroup = &PipeSG; 986 } 987 } 988 989 if (!OtherGroup) 990 return false; 991 if (!OtherGroup->Collection.size()) 992 return true; 993 994 // Does the previous VALU have this DS_Write as a successor 995 return (std::any_of(OtherGroup->Collection.begin(), 996 OtherGroup->Collection.end(), [&SU](SUnit *Elt) { 997 return std::any_of(Elt->Succs.begin(), 998 Elt->Succs.end(), 999 [&SU](SDep &Succ) { 1000 return Succ.getSUnit() == SU; 1001 }); 1002 })); 1003 } 1004 IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, 1005 bool NeedsCache = false) 1006 : InstructionRule(TII, SGID, NeedsCache) {} 1007 }; 1008 1009 // Whether the combined load width of group is 128 bits 1010 class VMEMSize final : public InstructionRule { 1011 public: 1012 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1013 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1014 auto MI = SU->getInstr(); 1015 if (MI->getOpcode() == TargetOpcode::BUNDLE) 1016 return false; 1017 if (!Collection.size()) 1018 return true; 1019 1020 int NumBits = 0; 1021 1022 auto TRI = TII->getRegisterInfo(); 1023 auto &MRI = MI->getParent()->getParent()->getRegInfo(); 1024 for (auto &Elt : Collection) { 1025 auto Op = Elt->getInstr()->getOperand(0); 1026 auto Size = 1027 TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); 1028 NumBits += Size; 1029 } 1030 1031 if (NumBits < 128) { 1032 assert(TII->isVMEM(*MI) && MI->mayLoad()); 1033 if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( 1034 MRI, MI->getOperand(0))) <= 1035 128) 1036 return true; 1037 } 1038 1039 return false; 1040 } 1041 1042 VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1043 : InstructionRule(TII, SGID, NeedsCache) {} 1044 }; 1045 1046 // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 1047 // that is /p Distance steps away 1048 class SharesPredWithPrevNthGroup final : public InstructionRule { 1049 private: 1050 unsigned Distance = 1; 1051 1052 public: 1053 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1054 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1055 SchedGroup *OtherGroup = nullptr; 1056 if (!SyncPipe.size()) 1057 return false; 1058 1059 if (!Cache->size()) { 1060 1061 for (auto &PipeSG : SyncPipe) { 1062 if ((unsigned)PipeSG.getSGID() == SGID - Distance) { 1063 OtherGroup = &PipeSG; 1064 } 1065 } 1066 1067 if (!OtherGroup) 1068 return false; 1069 if (!OtherGroup->Collection.size()) 1070 return true; 1071 1072 for (auto &OtherEle : OtherGroup->Collection) { 1073 for (auto &Pred : OtherEle->Preds) { 1074 if (Pred.getSUnit()->getInstr()->getOpcode() == 1075 AMDGPU::V_PERM_B32_e64) 1076 
Cache->push_back(Pred.getSUnit()); 1077 } 1078 } 1079 } 1080 1081 assert(Cache->size()); 1082 auto DAG = SyncPipe[0].DAG; 1083 // Does the previous DS_WRITE share a V_PERM predecessor with this 1084 // VMEM_READ 1085 return ( 1086 std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) { 1087 return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); 1088 })); 1089 } 1090 SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, 1091 unsigned SGID, bool NeedsCache = false) 1092 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} 1093 }; 1094 1095 public: 1096 void applyIGLPStrategy( 1097 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 1098 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override; 1099 1100 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } 1101 1102 MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 1103 : IGLPStrategy(DAG, TII) { 1104 IsBottomUp = 0; 1105 } 1106 }; 1107 1108 void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( 1109 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 1110 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) { 1111 unsigned MFMACount = 0; 1112 unsigned DSWCount = 0; 1113 unsigned DSWWithPermCount = 0; 1114 unsigned DSWWithSharedVMEMCount = 0; 1115 unsigned DSRCount = 0; 1116 SmallVector<SUnit *, 6> DSWithPerms; 1117 for (auto &SU : DAG->SUnits) { 1118 auto I = SU.getInstr(); 1119 if (TII->isMFMAorWMMA(*I)) 1120 ++MFMACount; 1121 else if (TII->isDS(*I)) { 1122 if (I->mayLoad()) 1123 ++DSRCount; 1124 else if (I->mayStore()) { 1125 ++DSWCount; 1126 for (auto Pred : SU.Preds) { 1127 if (Pred.getSUnit()->getInstr()->getOpcode() == 1128 AMDGPU::V_PERM_B32_e64) { 1129 DSWithPerms.push_back(&SU); 1130 break; 1131 } 1132 } 1133 } 1134 } 1135 } 1136 DSWWithPermCount = DSWithPerms.size(); 1137 auto I = DSWithPerms.begin(); 1138 auto E = DSWithPerms.end(); 1139 1140 // Get the count of DS_WRITES with V_PERM predecessors which 1141 // have loop carried dependencies (WAR) on the same VMEM_READs. 1142 // We consider partial overlap as a miss -- in other words, 1143 // for a given DS_W, we only consider another DS_W as matching 1144 // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred 1145 // for every V_PERM pred of this DS_W. 
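// For illustration (hypothetical SUs): if DS_W0's V_PERM preds lead to the
// VMEM_READs {A, B} and DS_W1's V_PERM preds also lead to exactly {A, B}, the
// two DS_Ws form a matching pair and add 2 to DSWWithSharedVMEMCount. If
// DS_W1 instead led to {A, C}, the overlap is only partial and neither DS_W
// is counted here.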
1146 DenseMap<MachineInstr *, SUnit *> VMEMLookup; 1147 SmallVector<SUnit *, 6> Counted; 1148 for (; I != E; I++) { 1149 SUnit *Cand = nullptr; 1150 bool MissedAny = false; 1151 for (auto &Pred : (*I)->Preds) { 1152 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) 1153 continue; 1154 1155 if (Cand && 1156 std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) 1157 break; 1158 1159 for (auto &Succ : Pred.getSUnit()->Succs) { 1160 auto MI = Succ.getSUnit()->getInstr(); 1161 if (!TII->isVMEM(*MI) || !MI->mayLoad()) 1162 continue; 1163 1164 if (MissedAny || !VMEMLookup.size()) { 1165 MissedAny = true; 1166 VMEMLookup[MI] = *I; 1167 continue; 1168 } 1169 1170 if (!VMEMLookup.contains(MI)) { 1171 MissedAny = true; 1172 VMEMLookup[MI] = *I; 1173 continue; 1174 } 1175 1176 Cand = VMEMLookup[MI]; 1177 if (std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) { 1178 MissedAny = true; 1179 break; 1180 } 1181 } 1182 } 1183 if (!MissedAny && Cand) { 1184 DSWWithSharedVMEMCount += 2; 1185 Counted.push_back(Cand); 1186 Counted.push_back(*I); 1187 } 1188 } 1189 1190 assert(DSWWithSharedVMEMCount <= DSWWithPermCount); 1191 SchedGroup *SG; 1192 unsigned PipelineSyncID = 0; 1193 // For kernels with V_PERM, there are enough VALU to mix in between MFMAs 1194 if (DSWWithPermCount) { 1195 for (unsigned I = 0; I < MFMACount; I++) { 1196 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1197 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1198 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1199 1200 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1201 SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); 1202 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1203 } 1204 } 1205 1206 PipelineSyncID = 1; 1207 // Phase 1: Break up DS_READ and MFMA clusters. 1208 // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ 1209 // prefetch 1210 1211 // Make ready initial MFMA 1212 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1213 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); 1214 SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); 1215 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1216 1217 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1218 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1219 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1220 1221 // Interleave MFMA with DS_READ prefetch 1222 for (unsigned I = 0; I < DSRCount - 4; ++I) { 1223 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1224 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); 1225 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1226 1227 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1228 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1229 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1230 } 1231 1232 // Phase 2a: Loop carried dependency with V_PERM 1233 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 1234 // depend on. Interleave MFMA to keep XDL unit busy throughout. 
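// As a reading aid, each iteration of the loop below appends the following
// SchedGroup sequence (max size in parentheses, rules in brackets):
//   VALU(4)      [IsPermForDSW]      -- V_PERMs that feed a DS_WRITE
//   DS_WRITE(1)  [IsSuccOfPrevGroup] -- the DS_WRITE fed by those V_PERMs
//   VMEM_READ(4) [SharesPredWithPrevNthGroup(1), VMEMSize]
//   MFMA(1)
//   VMEM_READ(4) [SharesPredWithPrevNthGroup(3), VMEMSize]
//   MFMA(1)
// Both VMEM_READ groups reach back (SGID distances 1 and 3) to the DS_WRITE
// group of the same iteration.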
1235 for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { 1236 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1237 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 1238 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 1239 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1240 1241 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1242 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 1243 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 1244 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1245 1246 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1247 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1248 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 1249 1, TII, SG->getSGID(), true)); 1250 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1251 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1252 1253 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1254 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1255 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1256 1257 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1258 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1259 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 1260 3, TII, SG->getSGID(), true)); 1261 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1262 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1263 1264 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1265 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1266 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1267 } 1268 1269 // Phase 2b: Loop carried dependency without V_PERM 1270 // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. 1271 // Interleave MFMA to keep XDL unit busy throughout. 1272 for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { 1273 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1274 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 1275 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1276 1277 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1278 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1279 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1280 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1281 1282 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1283 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1284 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1285 } 1286 1287 // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are 1288 // ultimately used by two DS_WRITE 1289 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 1290 // depend on. Interleave MFMA to keep XDL unit busy throughout. 
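// As a reading aid, each iteration of the loop below appends the following
// SchedGroup sequence (max size in parentheses, rules in brackets):
//   VALU(4)      [IsPermForDSW]
//   DS_WRITE(1)  [IsSuccOfPrevGroup]
//   MFMA(1)
//   VALU(4)      [IsPermForDSW]
//   DS_WRITE(1)  [IsSuccOfPrevGroup]
//   MFMA(1)
//   VMEM_READ(4) [SharesPredWithPrevNthGroup(2), VMEMSize]
//   MFMA(1)
//   VMEM_READ(4) [SharesPredWithPrevNthGroup(4), VMEMSize]
//   MFMA(1)
// Here both VMEM_READ groups reach back (SGID distances 2 and 4) to the second
// DS_WRITE group of the iteration, since these loads feed two DS_WRITEs.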
1291 1292 for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { 1293 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1294 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 1295 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 1296 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1297 1298 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1299 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 1300 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 1301 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1302 1303 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1304 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1305 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1306 1307 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1308 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 1309 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 1310 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1311 1312 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1313 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 1314 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 1315 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1316 1317 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1318 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1319 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1320 1321 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1322 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1323 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 1324 2, TII, SG->getSGID(), true)); 1325 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1326 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1327 1328 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1329 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1330 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1331 1332 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1333 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1334 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 1335 4, TII, SG->getSGID(), true)); 1336 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1337 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1338 1339 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1340 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1341 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1342 } 1343 } 1344 1345 static std::unique_ptr<IGLPStrategy> 1346 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, 1347 const SIInstrInfo *TII) { 1348 switch (ID) { 1349 case MFMASmallGemmOptID: 1350 return std::make_unique<MFMASmallGemmOpt>(DAG, TII); 1351 case MFMASmallGemmSingleWaveOptID: 1352 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII); 1353 } 1354 1355 llvm_unreachable("Unknown IGLPStrategyID"); 1356 } 1357 1358 class IGroupLPDAGMutation : public ScheduleDAGMutation { 1359 private: 1360 const SIInstrInfo *TII; 1361 1362 ScheduleDAGMI *DAG; 1363 1364 // Organize lists of SchedGroups by their SyncID. SchedGroups / 1365 // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added 1366 // between then. 1367 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; 1368 1369 // Used to track instructions that can be mapped to multiple sched groups 1370 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; 1371 1372 // Add DAG edges that enforce SCHED_BARRIER ordering. 
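// For reference (illustrative IR sketch; mask bits follow the SchedGroupMask
// encoding at the top of this file): a SCHED_BARRIER that only lets MFMA
// cross, followed by SCHED_GROUP_BARRIERs requesting {DS_READ x2, MFMA x1} in
// sync group 0, might be requested from source as:
//   call void @llvm.amdgcn.sched.barrier(i32 8)
//   call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
//   call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)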
1373 void addSchedBarrierEdges(SUnit &SU); 1374 1375 // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should 1376 // not be reordered accross the SCHED_BARRIER. This is used for the base 1377 // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that 1378 // SCHED_BARRIER will always block all instructions that can be classified 1379 // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size 1380 // and may only synchronize with some SchedGroups. Returns the inverse of 1381 // Mask. SCHED_BARRIER's mask describes which instruction types should be 1382 // allowed to be scheduled across it. Invert the mask to get the 1383 // SchedGroupMask of instructions that should be barred. 1384 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; 1385 1386 // Create SchedGroups for a SCHED_GROUP_BARRIER. 1387 void initSchedGroupBarrierPipelineStage( 1388 std::vector<SUnit>::reverse_iterator RIter); 1389 1390 void initIGLPOpt(SUnit &SU); 1391 1392 public: 1393 void apply(ScheduleDAGInstrs *DAGInstrs) override; 1394 1395 // The order in which the PipelineSolver should process the candidate 1396 // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last 1397 // created SchedGroup first, and will consider that as the ultimate 1398 // predecessor group when linking. TOP_DOWN instead links and processes the 1399 // first created SchedGroup first. 1400 bool IsBottomUp = 1; 1401 1402 IGroupLPDAGMutation() = default; 1403 }; 1404 1405 unsigned SchedGroup::NumSchedGroups = 0; 1406 1407 bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { 1408 if (A != B && DAG->canAddEdge(B, A)) { 1409 DAG->addEdge(B, SDep(A, SDep::Artificial)); 1410 return true; 1411 } 1412 return false; 1413 } 1414 1415 bool SchedGroup::canAddMI(const MachineInstr &MI) const { 1416 bool Result = false; 1417 if (MI.isMetaInstruction()) 1418 Result = false; 1419 1420 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && 1421 (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI))) 1422 Result = true; 1423 1424 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && 1425 TII->isVALU(MI) && !TII->isMFMAorWMMA(MI)) 1426 Result = true; 1427 1428 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && 1429 TII->isSALU(MI)) 1430 Result = true; 1431 1432 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && 1433 TII->isMFMAorWMMA(MI)) 1434 Result = true; 1435 1436 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && 1437 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 1438 Result = true; 1439 1440 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && 1441 MI.mayLoad() && 1442 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 1443 Result = true; 1444 1445 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && 1446 MI.mayStore() && 1447 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 1448 Result = true; 1449 1450 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && 1451 TII->isDS(MI)) 1452 Result = true; 1453 1454 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && 1455 MI.mayLoad() && TII->isDS(MI)) 1456 Result = true; 1457 1458 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && 1459 MI.mayStore() && TII->isDS(MI)) 1460 Result = true; 1461 1462 LLVM_DEBUG( 1463 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) 1464 << (Result ? 
" could classify " : " unable to classify ") << MI); 1465 1466 return Result; 1467 } 1468 1469 int SchedGroup::link(SUnit &SU, bool MakePred, 1470 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 1471 int MissedEdges = 0; 1472 for (auto *A : Collection) { 1473 SUnit *B = &SU; 1474 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 1475 continue; 1476 if (MakePred) 1477 std::swap(A, B); 1478 1479 if (DAG->IsReachable(B, A)) 1480 continue; 1481 1482 // tryAddEdge returns false if there is a dependency that makes adding 1483 // the A->B edge impossible, otherwise it returns true; 1484 bool Added = tryAddEdge(A, B); 1485 if (Added) 1486 AddedEdges.push_back(std::pair(A, B)); 1487 else 1488 ++MissedEdges; 1489 } 1490 1491 return MissedEdges; 1492 } 1493 1494 void SchedGroup::link(SUnit &SU, bool MakePred) { 1495 for (auto *A : Collection) { 1496 SUnit *B = &SU; 1497 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 1498 continue; 1499 if (MakePred) 1500 std::swap(A, B); 1501 1502 tryAddEdge(A, B); 1503 } 1504 } 1505 1506 void SchedGroup::link(SUnit &SU, 1507 function_ref<bool(const SUnit *A, const SUnit *B)> P) { 1508 for (auto *A : Collection) { 1509 SUnit *B = &SU; 1510 if (P(A, B)) 1511 std::swap(A, B); 1512 1513 tryAddEdge(A, B); 1514 } 1515 } 1516 1517 void SchedGroup::link(SchedGroup &OtherGroup) { 1518 for (auto *B : OtherGroup.Collection) 1519 link(*B); 1520 } 1521 1522 bool SchedGroup::canAddSU(SUnit &SU) const { 1523 MachineInstr &MI = *SU.getInstr(); 1524 if (MI.getOpcode() != TargetOpcode::BUNDLE) 1525 return canAddMI(MI); 1526 1527 // Special case for bundled MIs. 1528 const MachineBasicBlock *MBB = MI.getParent(); 1529 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; 1530 while (E != MBB->end() && E->isBundledWithPred()) 1531 ++E; 1532 1533 // Return true if all of the bundled MIs can be added to this group. 
1534 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); 1535 } 1536 1537 void SchedGroup::initSchedGroup() { 1538 for (auto &SU : DAG->SUnits) { 1539 if (isFull()) 1540 break; 1541 1542 if (canAddSU(SU)) 1543 add(SU); 1544 } 1545 } 1546 1547 void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 1548 SUnitsToCandidateSGsMap &SyncedInstrs) { 1549 SUnit &InitSU = *RIter; 1550 for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { 1551 auto &SU = *RIter; 1552 if (isFull()) 1553 break; 1554 1555 if (canAddSU(SU)) 1556 SyncedInstrs[&SU].push_back(SGID); 1557 } 1558 1559 add(InitSU); 1560 assert(MaxSize); 1561 (*MaxSize)++; 1562 } 1563 1564 void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { 1565 auto I = DAG->SUnits.rbegin(); 1566 auto E = DAG->SUnits.rend(); 1567 for (; I != E; ++I) { 1568 auto &SU = *I; 1569 if (isFull()) 1570 break; 1571 1572 if (canAddSU(SU)) 1573 SyncedInstrs[&SU].push_back(SGID); 1574 } 1575 } 1576 1577 void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { 1578 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); 1579 if (!TSchedModel || DAGInstrs->SUnits.empty()) 1580 return; 1581 1582 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); 1583 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); 1584 TII = ST.getInstrInfo(); 1585 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); 1586 SyncedSchedGroups.clear(); 1587 SyncedInstrs.clear(); 1588 bool foundSB = false; 1589 bool foundIGLP = false; 1590 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { 1591 unsigned Opc = R->getInstr()->getOpcode(); 1592 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. 1593 if (Opc == AMDGPU::SCHED_BARRIER) { 1594 addSchedBarrierEdges(*R); 1595 foundSB = true; 1596 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { 1597 initSchedGroupBarrierPipelineStage(R); 1598 foundSB = true; 1599 } else if (Opc == AMDGPU::IGLP_OPT) { 1600 resetEdges(*R, DAG); 1601 if (!foundSB && !foundIGLP) 1602 initIGLPOpt(*R); 1603 foundIGLP = true; 1604 } 1605 } 1606 1607 if (foundSB || foundIGLP) { 1608 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); 1609 // PipelineSolver performs the mutation by adding the edges it 1610 // determined as the best 1611 PS.solve(); 1612 } 1613 } 1614 1615 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { 1616 MachineInstr &MI = *SchedBarrier.getInstr(); 1617 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); 1618 // Remove all existing edges from the SCHED_BARRIER that were added due to the 1619 // instruction having side effects. 1620 resetEdges(SchedBarrier, DAG); 1621 auto InvertedMask = 1622 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); 1623 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); 1624 SG.initSchedGroup(); 1625 // Preserve original instruction ordering relative to the SCHED_BARRIER. 1626 SG.link( 1627 SchedBarrier, 1628 (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( 1629 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); 1630 } 1631 1632 SchedGroupMask 1633 IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { 1634 // Invert mask and erase bits for types of instructions that are implied to be 1635 // allowed past the SCHED_BARRIER. 1636 SchedGroupMask InvertedMask = ~Mask; 1637 1638 // ALU implies VALU, SALU, MFMA. 
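// Worked example: a SCHED_BARRIER mask of 0x8 (only MFMA may be scheduled
// across) first inverts to "everything except MFMA". Since MFMAs also match
// the generic ALU group, the ALU bit is then cleared by the else-if below so
// that MFMAs are not indirectly blocked, leaving
// VALU | SALU | VMEM | VMEM_READ | VMEM_WRITE | DS | DS_READ | DS_WRITE
// barred.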
1639 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) 1640 InvertedMask &= 1641 ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA; 1642 // VALU, SALU, MFMA implies ALU. 1643 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || 1644 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || 1645 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE) 1646 InvertedMask &= ~SchedGroupMask::ALU; 1647 1648 // VMEM implies VMEM_READ, VMEM_WRITE. 1649 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) 1650 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; 1651 // VMEM_READ, VMEM_WRITE implies VMEM. 1652 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || 1653 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) 1654 InvertedMask &= ~SchedGroupMask::VMEM; 1655 1656 // DS implies DS_READ, DS_WRITE. 1657 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) 1658 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; 1659 // DS_READ, DS_WRITE implies DS. 1660 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || 1661 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) 1662 InvertedMask &= ~SchedGroupMask::DS; 1663 1664 return InvertedMask; 1665 } 1666 1667 void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( 1668 std::vector<SUnit>::reverse_iterator RIter) { 1669 // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due 1670 // to the instruction having side effects. 1671 resetEdges(*RIter, DAG); 1672 MachineInstr &SGB = *RIter->getInstr(); 1673 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); 1674 int32_t SGMask = SGB.getOperand(0).getImm(); 1675 int32_t Size = SGB.getOperand(1).getImm(); 1676 int32_t SyncID = SGB.getOperand(2).getImm(); 1677 1678 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, 1679 Size, SyncID, DAG, TII); 1680 1681 SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); 1682 } 1683 1684 void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { 1685 IGLPStrategyID StrategyID = 1686 (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); 1687 auto S = createIGLPStrategy(StrategyID, DAG, TII); 1688 if (S->shouldApplyStrategy(DAG)) { 1689 IsBottomUp = S->IsBottomUp; 1690 S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups); 1691 } 1692 } 1693 1694 } // namespace 1695 1696 namespace llvm { 1697 1698 std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() { 1699 return std::make_unique<IGroupLPDAGMutation>(); 1700 } 1701 1702 } // end namespace llvm 1703
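// Typical use (sketch): the AMDGPU scheduler setup registers this mutation on
// its ScheduleDAGMI, e.g. DAG->addMutation(createIGroupLPDAGMutation()), after
// which apply() runs once per scheduling region.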