//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimum
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "igrouplp"

namespace {

static cl::opt<bool> EnableExactSolver(
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),
    cl::init(false));

static cl::opt<unsigned> CutoffForExact(
    "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));

static cl::opt<uint64_t> MaxBranchesExplored(
    "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
    cl::desc("The number of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

static cl::opt<bool> UseCostHeur(
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));

// Components of the mask that determine which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,
  VALU = 1u << 1,
  SALU = 1u << 2,
  MFMA = 1u << 3,
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  DS = 1u << 7,
  DS_READ = 1u << 8,
  DS_WRITE = 1u << 9,
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
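// Note (illustrative only, not part of the mutation itself): scheduling
// regions typically acquire SCHED_BARRIER, SCHED_GROUP_BARRIER, and IGLP_OPT
// pseudos from the AMDGPU scheduling intrinsics. For example, a pipeline
// stage of one DS read followed by one MFMA could be requested roughly as
// (mask values per the SchedGroupMask enum above: DS_READ = 0x100,
// MFMA = 0x8):
//
//   call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
//   call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
//
// while @llvm.amdgcn.sched.barrier takes a single mask of the instruction
// types allowed to cross it. This sketch is only meant to connect the mask
// bits above to the barrier operands parsed later in this file; exact builtin
// spellings are frontend-dependent.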
// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Mask that defines which instruction types can be classified into this
  // SchedGroup. The instruction types correspond to the mask from
  // SCHED_BARRIER and SCHED_GROUP_BARRIER.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  Optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the
  // same SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups.
  unsigned SGID;

  // Count of the number of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  ScheduleDAGInstrs *DAG;

  const SIInstrInfo *TII;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
  // If MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies, track which edges are added, and return the count
  // of missed edges.
  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove the last element from the SchedGroup.
  void pop() { Collection.pop_back(); }

  // Identify and add all relevant SUs from the DAG to this SchedGroup.
  void initSchedGroup();

  // Add instructions to the SchedGroup bottom up starting from RIter.
  // PipelineInstrs is a set of instructions that should not be added to the
  // SchedGroup even when the other conditions for adding it are satisfied.
  // RIter will be added to the SchedGroup as well, and dependencies will be
  // added so that RIter will always be scheduled at the end of the group.
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }
};

// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);

  while (!SU.Preds.empty())
    for (auto &P : SU.Preds)
      SU.removePred(P);

  while (!SU.Succs.empty())
    for (auto &S : SU.Succs)
      for (auto &SP : S.getSUnit()->Preds)
        if (SP.getSUnit() == &SU)
          S.getSUnit()->removePred(SP);
}

typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;

// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
// in non-trivial cases. For example, if the requested pipeline is
// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
// in the DAG, then we will have an instruction that cannot be trivially
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
// to find a good solution to the pipeline -- a greedy algorithm and an exact
// algorithm. The exact algorithm has an exponential time complexity and should
// only be used for small or medium sized problems where an exact solution is
// highly desired.
class PipelineSolver {
  ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
  // The current working pipeline
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
  // The pipeline that has the best solution found so far
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of the search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
  unsigned computeProblemSize();

  // The cost penalty of not assigning a SU to a SchedGroup
  int MissPenalty = 0;

  // Cost in terms of the number of edges we are unable to add
  int BestCost = -1;
  int CurrCost = 0;

  // Index pointing to the conflicting instruction that is currently being
  // fitted
  int CurrConflInstNo = 0;
  // Index to the pipeline that is currently being fitted
  int CurrSyncGroupIdx = 0;
  // The first non-trivial pipeline
  int BeginSyncGroupIdx = 0;

  // How many branches we have explored
  uint64_t BranchesExplored = 0;

  // Update indices to fit the next conflicting instruction
  void advancePosition();
  // Recede indices to attempt to find a better fit for the previous
  // conflicting instruction
  void retreatPosition();

  // The exponential time algorithm which finds the provably best fit
  bool solveExact();
  // The polynomial time algorithm which attempts to find a good fit
  bool solveGreedy();
  // Whether or not the current solution is optimal
  bool checkOptimal();
  // Populate the ready list, prioritizing fewest missed edges first
  void populateReadyList(SUToCandSGsPair &CurrSU,
                         SmallVectorImpl<std::pair<int, int>> &ReadyList,
                         SmallVectorImpl<SchedGroup> &SyncPipeline);
  // Add edges corresponding to the SchedGroups as assigned by the solver
  void makePipeline();
  // Add the edges from the SU to the other SchedGroups in the pipeline, and
  // return the number of edges missed.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Remove the edges passed via AddedEdges
  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Convert the passed in maps to arrays for bidirectional iterators
  void convertSyncMapsToArrays();

  void reset();

public:
  // Invoke the solver to map instructions to instruction groups. A heuristic
  // and command-line options determine whether to use the exact or greedy
  // algorithm.
  void solve();

  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
                 ScheduleDAGMI *DAG)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups) {

    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};

void PipelineSolver::reset() {

  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {
      SmallVector<SUnit *, 32> TempCollection = SG.Collection;
      SG.Collection.clear();
      auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
      });
      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);
    }
  }

  CurrSyncGroupIdx = BeginSyncGroupIdx;
  CurrConflInstNo = 0;
  CurrCost = 0;
}

void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
  }

  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::make_pair(SUsToCandSGs.first, SUsToCandSGs.second));
        continue;
      }
      auto SortPosition = PipelineInstrs[PipelineIDx].begin();
      // Insert them in sorted order -- this allows for good parsing order in
      // the greedy algorithm
      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
        ++SortPosition;
      PipelineInstrs[PipelineIDx].insert(
          SortPosition,
          std::make_pair(SUsToCandSGs.first, SUsToCandSGs.second));
    }
    --PipelineIDx;
  }
}

void PipelineSolver::makePipeline() {
  // Preserve the order of the barriers for subsequent SchedGroupBarrier
  // mutations.
  for (auto &SyncPipeline : BestPipeline) {
    for (auto &SG : SyncPipeline) {
      SUnit *SGBarr = nullptr;
      for (auto &SU : SG.Collection) {
        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
          SGBarr = SU;
      }
      // Command-line-requested IGroupLP doesn't have a SGBarr.
      if (!SGBarr)
        continue;
      resetEdges(*SGBarr, DAG);
      SG.link(*SGBarr, false);
    }
  }

  for (auto &SyncPipeline : BestPipeline) {
    auto I = SyncPipeline.rbegin();
    auto E = SyncPipeline.rend();
    for (; I != E; ++I) {
      auto &GroupA = *I;
      for (auto J = std::next(I); J != E; ++J) {
        auto &GroupB = *J;
        GroupA.link(GroupB);
      }
    }
  }
}

int PipelineSolver::addEdges(
    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int AddedCost = 0;
  bool MakePred = false;

  // The groups in the pipeline are in reverse order. Thus,
  // by traversing them from last to first, we are traversing
  // them in the order in which they were introduced in the code. After we
  // pass the group the SU is being assigned to, it should be
  // linked as a predecessor of the subsequent SchedGroups.
  auto GroupNo = (int)SyncPipeline.size() - 1;
  for (; GroupNo >= 0; GroupNo--) {
    if (SyncPipeline[GroupNo].getSGID() == SGID) {
      MakePred = true;
      continue;
    }
    auto Group = &SyncPipeline[GroupNo];
    AddedCost += Group->link(*SU, MakePred, AddedEdges);
    assert(AddedCost >= 0);
  }

  return AddedCost;
}

void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
  // Only remove the edges that we have added when testing
  // the fit.
  for (auto &PredSuccPair : EdgesToRemove) {
    SUnit *Pred = PredSuccPair.first;
    SUnit *Succ = PredSuccPair.second;

    auto Match = llvm::find_if(
        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
    if (Match != Succ->Preds.end()) {
      assert(Match->isArtificial());
      Succ->removePred(*Match);
    }
  }
}

void PipelineSolver::advancePosition() {
  ++CurrConflInstNo;

  if (static_cast<size_t>(CurrConflInstNo) >=
      PipelineInstrs[CurrSyncGroupIdx].size()) {
    CurrConflInstNo = 0;
    ++CurrSyncGroupIdx;
    // Advance to next non-trivial pipeline
    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      ++CurrSyncGroupIdx;
  }
}

void PipelineSolver::retreatPosition() {
  assert(CurrConflInstNo >= 0);
  assert(CurrSyncGroupIdx >= 0);

  if (CurrConflInstNo > 0) {
    --CurrConflInstNo;
    return;
  }

  if (CurrConflInstNo == 0) {
    // If we return to the starting position, we have explored
    // the entire tree
    if (CurrSyncGroupIdx == BeginSyncGroupIdx)
      return;

    --CurrSyncGroupIdx;
    // Go to previous non-trivial pipeline
    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      --CurrSyncGroupIdx;

    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
  }
}

bool PipelineSolver::checkOptimal() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
    if (BestCost == -1 || CurrCost < BestCost) {
      BestPipeline = CurrPipeline;
      BestCost = CurrCost;
      LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
    }
    assert(BestCost >= 0);
  }

  bool DoneExploring = false;
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
    DoneExploring = true;

  return (DoneExploring || BestCost == 0);
}

void PipelineSolver::populateReadyList(
    SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
    SmallVectorImpl<SchedGroup> &SyncPipeline) {
  assert(CurrSU.second.size() >= 1);
  auto I = CurrSU.second.rbegin();
  auto E = CurrSU.second.rend();
  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    SchedGroup *Match;
    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)
        Match = &SG;
    }

    if (UseCostHeur) {
      if (Match->isFull()) {
        ReadyList.push_back(std::make_pair(*I, MissPenalty));
        continue;
      }

      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::make_pair(*I, TempCost));
      removeEdges(AddedEdges);
    } else
      ReadyList.push_back(std::make_pair(*I, -1));
  }

  if (UseCostHeur) {
    std::sort(ReadyList.begin(), ReadyList.end(),
              [](std::pair<int, int> A, std::pair<int, int> B) {
                return A.second < B.second;
              });
  }

  assert(ReadyList.size() == CurrSU.second.size());
}

bool PipelineSolver::solveExact() {
  if (checkOptimal())
    return true;

  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
    return false;

  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
  assert(static_cast<size_t>(CurrConflInstNo) <
         PipelineInstrs[CurrSyncGroupIdx].size());
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  // SchedGroup -> Cost pairs
  SmallVector<std::pair<int, int>, 4> ReadyList;
  // Prioritize the candidate sched groups in terms of lowest cost first
  populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);

  auto I = ReadyList.begin();
  auto E = ReadyList.end();
  for (; I != E; ++I) {
    // If we are trying SGs in least cost order, and the current SG is cost
    // infeasible, then all subsequent SGs will also be cost infeasible, so we
    // can prune.
    if (BestCost != -1 && (CurrCost + I->second > BestCost))
      return false;

    int CandSGID = I->first;
    int AddedCost = 0;
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
    SchedGroup *Match;
    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)
        Match = &SG;
    }

    if (Match->isFull())
      continue;

    LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
                      << (int)Match->getMask() << " and ID " << CandSGID
                      << "\n");
    Match->add(*CurrSU.first);
    AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
    CurrCost += AddedCost;
    advancePosition();
    ++BranchesExplored;
    bool FinishedExploring = false;
    // If the Cost after adding edges is greater than a known solution,
    // backtrack
    if (CurrCost < BestCost || BestCost == -1) {
      if (solveExact()) {
        FinishedExploring = BestCost != 0;
        if (!FinishedExploring)
          return true;
      }
    }

    retreatPosition();
    CurrCost -= AddedCost;
    removeEdges(AddedEdges);
    Match->pop();
    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
    if (FinishedExploring)
      return true;
  }

  // Try the pipeline where the current instruction is omitted.
  // Potentially, if we omit a problematic instruction from the pipeline,
  // all the other instructions can nicely fit.
  CurrCost += MissPenalty;
  advancePosition();

  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

  bool FinishedExploring = false;
  if (CurrCost < BestCost || BestCost == -1) {
    if (solveExact()) {
      FinishedExploring = BestCost != 0;
      if (!FinishedExploring)
        return true;
    }
  }

  retreatPosition();
  CurrCost -= MissPenalty;
  return FinishedExploring;
}

bool PipelineSolver::solveGreedy() {
  BestCost = 0;
  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
    int BestNodeCost = -1;
    int TempCost;
    SchedGroup *BestGroup = nullptr;
    int BestGroupID = -1;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
    LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                      << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

    // Since we have added the potential SchedGroups from bottom up, but
    // traversed the DAG from top down, parse over the groups from last to
    // first. If we fail to do this for the greedy algorithm, the solution will
    // likely not be good in more complex cases.
    auto I = CurrSU.second.rbegin();
    auto E = CurrSU.second.rend();
    for (; I != E; ++I) {
      std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
      int CandSGID = *I;
      SchedGroup *Match;
      for (auto &SG : SyncPipeline) {
        if (SG.getSGID() == CandSGID)
          Match = &SG;
      }

      LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                        << (int)Match->getMask() << "\n");

      if (Match->isFull()) {
        LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
        continue;
      }
      TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
      if (TempCost < BestNodeCost || BestNodeCost == -1) {
        BestGroup = Match;
        BestNodeCost = TempCost;
        BestGroupID = CandSGID;
      }
      removeEdges(AddedEdges);
      if (BestNodeCost == 0)
        break;
    }

    if (BestGroupID != -1) {
      BestGroup->add(*CurrSU.first);
      addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
      LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
                        << (int)BestGroup->getMask() << "\n");
      BestCost += TempCost;
    } else
      BestCost += MissPenalty;

    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
    advancePosition();
  }
  BestPipeline = CurrPipeline;
  removeEdges(AddedEdges);
  return false;
}

unsigned PipelineSolver::computeProblemSize() {
  unsigned ProblemSize = 0;
  for (auto &PipeConflicts : PipelineInstrs) {
    ProblemSize += PipeConflicts.size();
  }

  return ProblemSize;
}
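// Worked example for solve() below (illustrative numbers, not taken from a
// real kernel): with two synced pipelines containing 3 and 4 conflicted SUs,
// ProblemSize is 7, so MissPenalty becomes 7 / 2 + 1 = 4. If
// -amdgpu-igrouplp-exact-solver-cutoff=10 were passed, 7 <= 10 puts the
// problem below the cutoff, so the greedy solver runs first and the exact
// solver is only invoked to refine a greedy result whose cost is nonzero.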
void PipelineSolver::solve() {
  if (!NeedsSolver)
    return;

  unsigned ProblemSize = computeProblemSize();
  assert(ProblemSize > 0);

  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
  MissPenalty = (ProblemSize / 2) + 1;

  LLVM_DEBUG(DAG->dump());
  if (EnableExactSolver || BelowCutoff) {
    LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
    solveGreedy();
    reset();
    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
    if (BestCost > 0) {
      LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
      solveExact();
      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
    }
  } else { // Use the Greedy Algorithm by default
    LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
    solveGreedy();
  }

  makePipeline();
}

enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };

// Implement an IGLP scheduling strategy.
class IGLPStrategy {
protected:
  ScheduleDAGInstrs *DAG;

  const SIInstrInfo *TII;

public:
  // Add SchedGroups to \p Pipeline to implement this Strategy.
  virtual void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;

  // Returns true if this strategy should be applied to a ScheduleDAG.
  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;

  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : DAG(DAG), TII(TII) {}

  virtual ~IGLPStrategy() = default;
};

class MFMASmallGemmOpt final : public IGLPStrategy {
public:
  void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {}
};

void MFMASmallGemmOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (auto I = DAG->begin(), E = DAG->end(); I != E; ++I) {
    if (TII->isMFMA(*I))
      ++MFMACount;
  }

  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  for (unsigned I = 0; I < MFMACount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }
}
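// Illustrative note: MFMASmallGemmOpt above is requested from source by the
// IGLP_OPT pseudo with strategy ID 0 (for example, a call to
// @llvm.amdgcn.iglp.opt(i32 0) in IR). For a region with N MFMAs it requests
// N rounds of {DS_READ, VMEM_READ, MFMA, VMEM_WRITE, DS_WRITE} groups of size
// one, followed by N rounds of the same groups without the MFMA stage. The
// intrinsic spelling is given only for orientation; the strategy itself is
// selected in initIGLPOpt() below from the instruction's first operand.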
static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                   const SIInstrInfo *TII) {
  switch (ID) {
  case MFMASmallGemmOptID:
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
  }

  llvm_unreachable("Unknown IGLPStrategyID");
}

class IGroupLPDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Organize lists of SchedGroups by their SyncID. SchedGroups /
  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
  // between them.
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;

  // Used to track instructions that can be mapped to multiple sched groups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
  // not be reordered across the SCHED_BARRIER. This is used for the base
  // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
  // SCHED_BARRIER will always block all instructions that can be classified
  // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
  // and may only synchronize with some SchedGroups. Returns the inverse of
  // Mask. SCHED_BARRIER's mask describes which instruction types should be
  // allowed to be scheduled across it. Invert the mask to get the
  // SchedGroupMask of instructions that should be barred.
  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  // Create SchedGroups for a SCHED_GROUP_BARRIER.
  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  void initIGLPOpt(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  IGroupLPDAGMutation() = default;
};

unsigned SchedGroup::NumSchedGroups = 0;

bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
  if (A != B && DAG->canAddEdge(B, A)) {
    DAG->addEdge(B, SDep(A, SDep::Artificial));
    return true;
  }
  return false;
}
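// Classification examples implied by the checks in canAddMI() below: an MFMA
// instruction can be placed in groups with the ALU or MFMA bits set; a FLAT
// load that is not DS counts as VMEM and VMEM_READ; a DS store counts as DS
// and DS_WRITE. These are observations about the code that follows, not an
// exhaustive specification of the mask semantics.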
bool SchedGroup::canAddMI(const MachineInstr &MI) const {
  bool Result = false;
  if (MI.isMetaInstruction())
    Result = false;

  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
           (TII->isVALU(MI) || TII->isMFMA(MI) || TII->isSALU(MI)))
    Result = true;

  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
           TII->isVALU(MI) && !TII->isMFMA(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
           TII->isSALU(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMA(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
           TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))
    Result = true;

  LLVM_DEBUG(
      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
             << (Result ? " could classify " : " unable to classify ") << MI);

  return Result;
}

int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int MissedEdges = 0;
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    if (DAG->IsReachable(B, A))
      continue;
    // tryAddEdge returns false if there is a dependency that makes adding
    // the A->B edge impossible, otherwise it returns true.
    bool Added = tryAddEdge(A, B);
    if (Added)
      AddedEdges.push_back(std::make_pair(A, B));
    else
      ++MissedEdges;
  }

  return MissedEdges;
}

void SchedGroup::link(SUnit &SU, bool MakePred) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}

void SchedGroup::link(SUnit &SU,
                      function_ref<bool(const SUnit *A, const SUnit *B)> P) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (P(A, B))
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}

void SchedGroup::link(SchedGroup &OtherGroup) {
  for (auto *B : OtherGroup.Collection)
    link(*B);
}

bool SchedGroup::canAddSU(SUnit &SU) const {
  MachineInstr &MI = *SU.getInstr();
  if (MI.getOpcode() != TargetOpcode::BUNDLE)
    return canAddMI(MI);

  // Special case for bundled MIs.
  const MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
  while (E != MBB->end() && E->isBundledWithPred())
    ++E;

  // Return true if all of the bundled MIs can be added to this group.
  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
}

void SchedGroup::initSchedGroup() {
  for (auto &SU : DAG->SUnits) {
    if (isFull())
      break;

    if (canAddSU(SU))
      add(SU);
  }
}

void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                                SUnitsToCandidateSGsMap &SyncedInstrs) {
  SUnit &InitSU = *RIter;
  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
    auto &SU = *RIter;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }

  add(InitSU);
  assert(MaxSize);
  (*MaxSize)++;
}

void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
  auto I = DAG->SUnits.rbegin();
  auto E = DAG->SUnits.rend();
  for (; I != E; ++I) {
    auto &SU = *I;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }
}

void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  SyncedSchedGroups.clear();
  SyncedInstrs.clear();
  bool foundSB = false;
  bool foundIGLP = false;
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
    unsigned Opc = R->getInstr()->getOpcode();
    // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
    if (Opc == AMDGPU::SCHED_BARRIER) {
      addSchedBarrierEdges(*R);
      foundSB = true;
    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
      initSchedGroupBarrierPipelineStage(R);
      foundSB = true;
    } else if (Opc == AMDGPU::IGLP_OPT) {
      resetEdges(*R, DAG);
      if (!foundSB && !foundIGLP)
        initIGLPOpt(*R);
      foundIGLP = true;
    }
  }

  if (foundSB || foundIGLP) {
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
    // PipelineSolver performs the mutation by adding the edges it
    // determined as the best
    PS.solve();
  }
}

void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetEdges(SchedBarrier, DAG);
  auto InvertedMask =
      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
  SchedGroup SG(InvertedMask, None, DAG, TII);
  SG.initSchedGroup();
  // Preserve original instruction ordering relative to the SCHED_BARRIER.
  SG.link(
      SchedBarrier,
      (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
          const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
}
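// Worked example of the inversion below (a sketch, not derived from a real
// kernel): a SCHED_BARRIER whose mask sets only the VALU bit allows VALU
// instructions to cross it. Inverting gives every other type; because VALU
// remains allowed, the generic ALU bit (which also matches VALU instructions)
// is cleared as well, so the resulting SchedGroup bars SALU, MFMA, all VMEM,
// and all DS instructions from being reordered across the barrier.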
SchedGroupMask
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
  // Invert mask and erase bits for types of instructions that are implied to
  // be allowed past the SCHED_BARRIER.
  SchedGroupMask InvertedMask = ~Mask;

  // ALU implies VALU, SALU, MFMA.
  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
    InvertedMask &=
        ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
  // VALU, SALU, MFMA implies ALU.
  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::ALU;

  // VMEM implies VMEM_READ, VMEM_WRITE.
  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
  // VMEM_READ, VMEM_WRITE implies VMEM.
  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM;

  // DS implies DS_READ, DS_WRITE.
  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
  // DS_READ, DS_WRITE implies DS.
  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS;

  return InvertedMask;
}

void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
    std::vector<SUnit>::reverse_iterator RIter) {
  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
  // to the instruction having side effects.
  resetEdges(*RIter, DAG);
  MachineInstr &SGB = *RIter->getInstr();
  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
  int32_t SGMask = SGB.getOperand(0).getImm();
  int32_t Size = SGB.getOperand(1).getImm();
  int32_t SyncID = SGB.getOperand(2).getImm();

  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
                                                    Size, SyncID, DAG, TII);

  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
}

void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
  IGLPStrategyID StrategyID =
      (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
  auto S = createIGLPStrategy(StrategyID, DAG, TII);
  if (S->shouldApplyStrategy(DAG))
    S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
}

} // namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
  return std::make_unique<IGroupLPDAGMutation>();
}

} // end namespace llvm