1 //======----------- WindowScheduler.cpp - window scheduler -------------======// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // An implementation of the Window Scheduling software pipelining algorithm. 10 // 11 // The fundamental concept of the window scheduling algorithm involves folding 12 // the original MBB at a specific position, followed by list scheduling on the 13 // folded MIs. The optimal scheduling result is then chosen from various folding 14 // positions as the final scheduling outcome. 15 // 16 // The primary challenge in this algorithm lies in generating the folded MIs and 17 // establishing their dependencies. We have innovatively employed a new MBB, 18 // created by copying the original MBB three times, known as TripleMBB. This 19 // TripleMBB enables the convenient implementation of MI folding and dependency 20 // establishment. To facilitate the algorithm's implementation, we have also 21 // devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle. 22 // 23 // Another challenge in the algorithm is the scheduling of phis. Semantically, 24 // it is difficult to place the phis in the window and perform list scheduling. 25 // Therefore, we schedule these phis separately after each list scheduling. 26 // 27 // The provided implementation is designed for use before the Register Allocator 28 // (RA). If the target requires implementation after RA, it is recommended to 29 // reimplement analyseII(), schedulePhi(), and expand(). Additionally, 30 // target-specific logic can be added in initialize(), preProcess(), and 31 // postProcess(). 32 // 33 // Lastly, it is worth mentioning that getSearchIndexes() is an important 34 // function. We have experimented with more complex heuristics on downstream 35 // target and achieved favorable results. 36 // 37 //===----------------------------------------------------------------------===// 38 #include "llvm/CodeGen/WindowScheduler.h" 39 #include "llvm/ADT/Statistic.h" 40 #include "llvm/CodeGen/LiveIntervals.h" 41 #include "llvm/CodeGen/MachineLoopInfo.h" 42 #include "llvm/CodeGen/MachinePipeliner.h" 43 #include "llvm/CodeGen/ModuloSchedule.h" 44 #include "llvm/CodeGen/TargetPassConfig.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/Debug.h" 47 #include "llvm/Support/TimeProfiler.h" 48 49 using namespace llvm; 50 51 #define DEBUG_TYPE "pipeliner" 52 53 namespace { 54 STATISTIC(NumTryWindowSchedule, 55 "Number of loops that we attempt to use window scheduling"); 56 STATISTIC(NumTryWindowSearch, 57 "Number of times that we run list schedule in the window scheduling"); 58 STATISTIC(NumWindowSchedule, 59 "Number of loops that we successfully use window scheduling"); 60 STATISTIC(NumFailAnalyseII, 61 "Window scheduling abort due to the failure of the II analysis"); 62 63 cl::opt<unsigned> 64 WindowSearchNum("window-search-num", 65 cl::desc("The number of searches per loop in the window " 66 "algorithm. 0 means no search number limit."), 67 cl::Hidden, cl::init(6)); 68 69 cl::opt<unsigned> WindowSearchRatio( 70 "window-search-ratio", 71 cl::desc("The ratio of searches per loop in the window algorithm. 100 " 72 "means search all positions in the loop, while 0 means not " 73 "performing any search."), 74 cl::Hidden, cl::init(40)); 75 76 cl::opt<unsigned> WindowIICoeff( 77 "window-ii-coeff", 78 cl::desc( 79 "The coefficient used when initializing II in the window algorithm."), 80 cl::Hidden, cl::init(5)); 81 82 cl::opt<unsigned> WindowRegionLimit( 83 "window-region-limit", 84 cl::desc( 85 "The lower limit of the scheduling region in the window algorithm."), 86 cl::Hidden, cl::init(3)); 87 88 cl::opt<unsigned> WindowDiffLimit( 89 "window-diff-limit", 90 cl::desc("The lower limit of the difference between best II and base II in " 91 "the window algorithm. If the difference is smaller than " 92 "this lower limit, window scheduling will not be performed."), 93 cl::Hidden, cl::init(2)); 94 } // namespace 95 96 // WindowIILimit serves as an indicator of abnormal scheduling results and could 97 // potentially be referenced by the derived target window scheduler. 98 cl::opt<unsigned> 99 WindowIILimit("window-ii-limit", 100 cl::desc("The upper limit of II in the window algorithm."), 101 cl::Hidden, cl::init(1000)); 102 103 WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML) 104 : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML), 105 Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()), 106 TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) { 107 TripleDAG = std::unique_ptr<ScheduleDAGInstrs>( 108 createMachineScheduler(/*OnlyBuildGraph=*/true)); 109 } 110 111 bool WindowScheduler::run() { 112 if (!initialize()) { 113 LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n"); 114 return false; 115 } 116 // The window algorithm is time-consuming, and its compilation time should be 117 // taken into consideration. 118 TimeTraceScope Scope("WindowSearch"); 119 ++NumTryWindowSchedule; 120 // Performing the relevant processing before window scheduling. 121 preProcess(); 122 // The main window scheduling begins. 123 std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler()); 124 auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio); 125 for (unsigned Idx : SearchIndexes) { 126 OriToCycle.clear(); 127 ++NumTryWindowSearch; 128 // The scheduling starts with non-phi instruction, so SchedPhiNum needs to 129 // be added to Idx. 130 unsigned Offset = Idx + SchedPhiNum; 131 auto Range = getScheduleRange(Offset, SchedInstrNum); 132 SchedDAG->startBlock(MBB); 133 SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum); 134 SchedDAG->schedule(); 135 LLVM_DEBUG(SchedDAG->dump()); 136 unsigned II = analyseII(*SchedDAG, Offset); 137 if (II == WindowIILimit) { 138 restoreTripleMBB(); 139 LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n"); 140 ++NumFailAnalyseII; 141 continue; 142 } 143 schedulePhi(Offset, II); 144 updateScheduleResult(Offset, II); 145 restoreTripleMBB(); 146 LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is " 147 << II << ".\n"); 148 } 149 // Performing the relevant processing after window scheduling. 150 postProcess(); 151 // Check whether the scheduling result is valid. 152 if (!isScheduleValid()) { 153 LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n"); 154 return false; 155 } 156 LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset 157 << " and Best II is " << BestII << ".\n"); 158 // Expand the scheduling result to prologue, kernel, and epilogue. 159 expand(); 160 ++NumWindowSchedule; 161 return true; 162 } 163 164 ScheduleDAGInstrs * 165 WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) { 166 return OnlyBuildGraph 167 ? new ScheduleDAGMI( 168 Context, std::make_unique<PostGenericScheduler>(Context), 169 true) 170 : Context->PassConfig->createMachineScheduler(Context); 171 } 172 173 bool WindowScheduler::initialize() { 174 if (!Subtarget->enableWindowScheduler()) { 175 LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n"); 176 return false; 177 } 178 // Initialized the member variables used by window algorithm. 179 OriMIs.clear(); 180 TriMIs.clear(); 181 TriToOri.clear(); 182 OriToCycle.clear(); 183 SchedResult.clear(); 184 SchedPhiNum = 0; 185 SchedInstrNum = 0; 186 BestII = UINT_MAX; 187 BestOffset = 0; 188 BaseII = 0; 189 // List scheduling used in the window algorithm depends on LiveIntervals. 190 if (!Context->LIS) { 191 LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n"); 192 return false; 193 } 194 // Check each MI in MBB. 195 SmallSet<Register, 8> PrevDefs; 196 SmallSet<Register, 8> PrevUses; 197 auto IsLoopCarried = [&](MachineInstr &Phi) { 198 // Two cases are checked here: (1)The virtual register defined by the 199 // preceding phi is used by the succeeding phi;(2)The preceding phi uses the 200 // virtual register defined by the succeeding phi. 201 if (PrevUses.count(Phi.getOperand(0).getReg())) 202 return true; 203 PrevDefs.insert(Phi.getOperand(0).getReg()); 204 for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { 205 if (PrevDefs.count(Phi.getOperand(I).getReg())) 206 return true; 207 PrevUses.insert(Phi.getOperand(I).getReg()); 208 } 209 return false; 210 }; 211 auto PLI = TII->analyzeLoopForPipelining(MBB); 212 for (auto &MI : *MBB) { 213 if (MI.isMetaInstruction() || MI.isTerminator()) 214 continue; 215 if (MI.isPHI()) { 216 if (IsLoopCarried(MI)) { 217 LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n"); 218 return false; 219 } 220 ++SchedPhiNum; 221 ++BestOffset; 222 } else 223 ++SchedInstrNum; 224 if (TII->isSchedulingBoundary(MI, MBB, *MF)) { 225 LLVM_DEBUG( 226 dbgs() << "Boundary MI is not allowed in window scheduling!\n"); 227 return false; 228 } 229 if (PLI->shouldIgnoreForPipelining(&MI)) { 230 LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in " 231 "window scheduling!\n"); 232 return false; 233 } 234 for (auto &Def : MI.all_defs()) 235 if (Def.isReg() && Def.getReg().isPhysical()) 236 return false; 237 } 238 if (SchedInstrNum <= WindowRegionLimit) { 239 LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n"); 240 return false; 241 } 242 return true; 243 } 244 245 void WindowScheduler::preProcess() { 246 // Prior to window scheduling, it's necessary to backup the original MBB, 247 // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB. 248 backupMBB(); 249 generateTripleMBB(); 250 TripleDAG->startBlock(MBB); 251 TripleDAG->enterRegion( 252 MBB, MBB->begin(), MBB->getFirstTerminator(), 253 std::distance(MBB->begin(), MBB->getFirstTerminator())); 254 TripleDAG->buildSchedGraph(Context->AA); 255 } 256 257 void WindowScheduler::postProcess() { 258 // After window scheduling, it's necessary to clear the TripleDAG and restore 259 // to the original MBB. 260 TripleDAG->exitRegion(); 261 TripleDAG->finishBlock(); 262 restoreMBB(); 263 } 264 265 void WindowScheduler::backupMBB() { 266 for (auto &MI : MBB->instrs()) 267 OriMIs.push_back(&MI); 268 // Remove MIs and the corresponding live intervals. 269 for (auto &MI : make_early_inc_range(*MBB)) { 270 Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true); 271 MBB->remove(&MI); 272 } 273 } 274 275 void WindowScheduler::restoreMBB() { 276 // Erase MIs and the corresponding live intervals. 277 for (auto &MI : make_early_inc_range(*MBB)) { 278 Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true); 279 MI.eraseFromParent(); 280 } 281 // Restore MBB to the state before window scheduling. 282 for (auto *MI : OriMIs) 283 MBB->push_back(MI); 284 updateLiveIntervals(); 285 } 286 287 void WindowScheduler::generateTripleMBB() { 288 const unsigned DuplicateNum = 3; 289 TriMIs.clear(); 290 TriToOri.clear(); 291 assert(OriMIs.size() > 0 && "The Original MIs were not backed up!"); 292 // Step 1: Performing the first copy of MBB instructions, excluding 293 // terminators. At the same time, we back up the anti-register of phis. 294 // DefPairs hold the old and new define register pairs. 295 DenseMap<Register, Register> DefPairs; 296 for (auto *MI : OriMIs) { 297 if (MI->isMetaInstruction() || MI->isTerminator()) 298 continue; 299 if (MI->isPHI()) 300 if (Register AntiReg = getAntiRegister(MI)) 301 DefPairs[MI->getOperand(0).getReg()] = AntiReg; 302 auto *NewMI = MF->CloneMachineInstr(MI); 303 MBB->push_back(NewMI); 304 TriMIs.push_back(NewMI); 305 TriToOri[NewMI] = MI; 306 } 307 // Step 2: Performing the remaining two copies of MBB instructions excluding 308 // phis, and the last one contains terminators. At the same time, registers 309 // are updated accordingly. 310 for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) { 311 for (auto *MI : OriMIs) { 312 if (MI->isPHI() || MI->isMetaInstruction() || 313 (MI->isTerminator() && Cnt < DuplicateNum - 1)) 314 continue; 315 auto *NewMI = MF->CloneMachineInstr(MI); 316 DenseMap<Register, Register> NewDefs; 317 // New defines are updated. 318 for (auto MO : NewMI->all_defs()) 319 if (MO.isReg() && MO.getReg().isVirtual()) { 320 Register NewDef = 321 MRI->createVirtualRegister(MRI->getRegClass(MO.getReg())); 322 NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI); 323 NewDefs[MO.getReg()] = NewDef; 324 } 325 // New uses are updated. 326 for (auto DefRegPair : DefPairs) 327 if (NewMI->readsRegister(DefRegPair.first, TRI)) { 328 Register NewUse = DefRegPair.second; 329 // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3': 330 // 331 // BB.3: DefPairs 332 // ================================== 333 // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7) 334 // ... 335 // ================================== 336 // ... 337 // %4 = sub i32 %1, %3 338 // ... 339 // %7 = add i32 %5, %6 340 // ... 341 // ---------------------------------- 342 // ... 343 // %8 = sub i32 %7, %3 (%1,%7),(%4,%8) 344 // ... 345 // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9) 346 // ... 347 // ---------------------------------- 348 // ... 349 // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9) 350 // ... ^ 351 // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11) 352 // ... 353 // ================================== 354 // < Terminators > 355 // ================================== 356 if (DefPairs.count(NewUse)) 357 NewUse = DefPairs[NewUse]; 358 NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI); 359 } 360 // DefPairs is updated at last. 361 for (auto &NewDef : NewDefs) 362 DefPairs[NewDef.first] = NewDef.second; 363 MBB->push_back(NewMI); 364 TriMIs.push_back(NewMI); 365 TriToOri[NewMI] = MI; 366 } 367 } 368 // Step 3: The registers used by phis are updated, and they are generated in 369 // the third copy of MBB. 370 // In the privious example, the old phi is: 371 // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] 372 // The new phi is: 373 // %1 = phi i32 [%2, %BB.1], [%11, %BB.3] 374 for (auto &Phi : MBB->phis()) { 375 for (auto DefRegPair : DefPairs) 376 if (Phi.readsRegister(DefRegPair.first, TRI)) 377 Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI); 378 } 379 updateLiveIntervals(); 380 } 381 382 void WindowScheduler::restoreTripleMBB() { 383 // After list scheduling, the MBB is restored in one traversal. 384 for (size_t I = 0; I < TriMIs.size(); ++I) { 385 auto *MI = TriMIs[I]; 386 auto OldPos = MBB->begin(); 387 std::advance(OldPos, I); 388 auto CurPos = MI->getIterator(); 389 if (CurPos != OldPos) { 390 MBB->splice(OldPos, MBB, CurPos); 391 Context->LIS->handleMove(*MI, /*UpdateFlags=*/false); 392 } 393 } 394 } 395 396 SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum, 397 unsigned SearchRatio) { 398 // We use SearchRatio to get the index range, and then evenly get the indexes 399 // according to the SearchNum. This is a simple huristic. Depending on the 400 // characteristics of the target, more complex algorithms can be used for both 401 // performance and compilation time. 402 assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!"); 403 unsigned MaxIdx = SchedInstrNum * SearchRatio / 100; 404 unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1; 405 SmallVector<unsigned> SearchIndexes; 406 for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step) 407 SearchIndexes.push_back(Idx); 408 return SearchIndexes; 409 } 410 411 int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) { 412 // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1. 413 unsigned MaxDepth = 1; 414 for (auto &SU : DAG.SUnits) 415 MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth); 416 return MaxDepth * WindowIICoeff; 417 } 418 419 int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG, 420 unsigned Offset) { 421 int InitII = getEstimatedII(DAG); 422 ResourceManager RM(Subtarget, &DAG); 423 RM.init(InitII); 424 // ResourceManager and DAG are used to calculate the maximum cycle for the 425 // scheduled MIs. Since MIs in the Region have already been scheduled, the 426 // emit cycles can be estimated in order here. 427 int CurCycle = 0; 428 auto Range = getScheduleRange(Offset, SchedInstrNum); 429 for (auto &MI : Range) { 430 auto *SU = DAG.getSUnit(&MI); 431 int ExpectCycle = CurCycle; 432 // The predecessors of current MI determine its earliest issue cycle. 433 for (auto &Pred : SU->Preds) { 434 if (Pred.isWeak()) 435 continue; 436 auto *PredMI = Pred.getSUnit()->getInstr(); 437 int PredCycle = getOriCycle(PredMI); 438 ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency()); 439 } 440 // ResourceManager can be used to detect resource conflicts between the 441 // current MI and the previously inserted MIs. 442 while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) { 443 ++CurCycle; 444 if (CurCycle == (int)WindowIILimit) 445 return CurCycle; 446 } 447 RM.reserveResources(*SU, CurCycle); 448 OriToCycle[getOriMI(&MI)] = CurCycle; 449 LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S." 450 << getOriStage(getOriMI(&MI), Offset) << "]: " << MI); 451 } 452 LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n"); 453 return CurCycle; 454 } 455 456 // By utilizing TripleDAG, we can easily establish dependencies between A and B. 457 // Based on the MaxCycle and the issue cycle of A and B, we can determine 458 // whether it is necessary to add a stall cycle. This is because, without 459 // inserting the stall cycle, the latency constraint between A and B cannot be 460 // satisfied. The details are as follows: 461 // 462 // New MBB: 463 // ======================================== 464 // < Phis > 465 // ======================================== (sliding direction) 466 // MBB copy 1 | 467 // V 468 // 469 // ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window----- 470 // | | 471 // ===================V==================== | 472 // MBB copy 2 < MI B > | 473 // | 474 // < MI A > V 475 // ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------ 476 // : 477 // ===================V==================== 478 // MBB copy 3 < MI B'> 479 // 480 // 481 // 482 // 483 // ======================================== 484 // < Terminators > 485 // ======================================== 486 int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) { 487 int MaxStallCycle = 0; 488 auto Range = getScheduleRange(Offset, SchedInstrNum); 489 for (auto &MI : Range) { 490 auto *SU = TripleDAG->getSUnit(&MI); 491 int DefCycle = getOriCycle(&MI); 492 for (auto &Succ : SU->Succs) { 493 if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU) 494 continue; 495 // If the expected cycle does not exceed MaxCycle, no check is needed. 496 if (DefCycle + (int)Succ.getLatency() <= MaxCycle) 497 continue; 498 // If the cycle of the scheduled MI A is less than that of the scheduled 499 // MI B, the scheduling will fail because the lifetime of the 500 // corresponding register exceeds II. 501 auto *SuccMI = Succ.getSUnit()->getInstr(); 502 int UseCycle = getOriCycle(SuccMI); 503 if (DefCycle < UseCycle) 504 return WindowIILimit; 505 // Get the stall cycle introduced by the register between two trips. 506 int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle; 507 MaxStallCycle = std::max(MaxStallCycle, StallCycle); 508 } 509 } 510 LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n"); 511 return MaxStallCycle; 512 } 513 514 unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) { 515 LLVM_DEBUG(dbgs() << "Start analyzing II:\n"); 516 int MaxCycle = calculateMaxCycle(DAG, Offset); 517 if (MaxCycle == (int)WindowIILimit) 518 return MaxCycle; 519 int StallCycle = calculateStallCycle(Offset, MaxCycle); 520 if (StallCycle == (int)WindowIILimit) 521 return StallCycle; 522 // The value of II is equal to the maximum execution cycle plus 1. 523 return MaxCycle + StallCycle + 1; 524 } 525 526 void WindowScheduler::schedulePhi(int Offset, unsigned &II) { 527 LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n"); 528 for (auto &Phi : MBB->phis()) { 529 int LateCycle = INT_MAX; 530 auto *SU = TripleDAG->getSUnit(&Phi); 531 for (auto &Succ : SU->Succs) { 532 // Phi doesn't have any Anti successors. 533 if (Succ.getKind() != SDep::Data) 534 continue; 535 // Phi is scheduled before the successor of stage 0. The issue cycle of 536 // phi is the latest cycle in this interval. 537 auto *SuccMI = Succ.getSUnit()->getInstr(); 538 int Cycle = getOriCycle(SuccMI); 539 if (getOriStage(getOriMI(SuccMI), Offset) == 0) 540 LateCycle = std::min(LateCycle, Cycle); 541 } 542 // The anti-dependency of phi need to be handled separately in the same way. 543 if (Register AntiReg = getAntiRegister(&Phi)) { 544 auto *AntiMI = MRI->getVRegDef(AntiReg); 545 // AntiReg may be defined outside the kernel MBB. 546 if (AntiMI->getParent() == MBB) { 547 auto AntiCycle = getOriCycle(AntiMI); 548 if (getOriStage(getOriMI(AntiMI), Offset) == 0) 549 LateCycle = std::min(LateCycle, AntiCycle); 550 } 551 } 552 // If there is no limit to the late cycle, a default value is given. 553 if (LateCycle == INT_MAX) 554 LateCycle = (int)(II - 1); 555 LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi); 556 // The issue cycle of phi is set to the latest cycle in the interval. 557 auto *OriPhi = getOriMI(&Phi); 558 OriToCycle[OriPhi] = LateCycle; 559 } 560 } 561 562 DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset, 563 unsigned II) { 564 // At each issue cycle, phi is placed before MIs in stage 0. So the simplest 565 // way is to put phi at the beginning of the current cycle. 566 DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs; 567 auto Range = getScheduleRange(Offset, SchedInstrNum); 568 for (auto &Phi : MBB->phis()) 569 CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi)); 570 for (auto &MI : Range) 571 CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI)); 572 // Each MI is assigned a separate ordered Id, which is used as a sort marker 573 // in the following expand process. 574 DenseMap<MachineInstr *, int> IssueOrder; 575 int Id = 0; 576 for (int Cycle = 0; Cycle < (int)II; ++Cycle) { 577 if (!CycleToMIs.count(Cycle)) 578 continue; 579 for (auto *MI : CycleToMIs[Cycle]) 580 IssueOrder[MI] = Id++; 581 } 582 return IssueOrder; 583 } 584 585 void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) { 586 // At the first update, Offset is equal to SchedPhiNum. At this time, only 587 // BestII, BestOffset, and BaseII need to be updated. 588 if (Offset == SchedPhiNum) { 589 BestII = II; 590 BestOffset = SchedPhiNum; 591 BaseII = II; 592 return; 593 } 594 // The update will only continue if the II is smaller than BestII and the II 595 // is sufficiently small. 596 if ((II >= BestII) || (II + WindowDiffLimit > BaseII)) 597 return; 598 BestII = II; 599 BestOffset = Offset; 600 // Record the result of the current list scheduling, noting that each MI is 601 // stored unordered in SchedResult. 602 SchedResult.clear(); 603 auto IssueOrder = getIssueOrder(Offset, II); 604 for (auto &Pair : OriToCycle) { 605 assert(IssueOrder.count(Pair.first) && "Cannot find original MI!"); 606 SchedResult.push_back(std::make_tuple(Pair.first, Pair.second, 607 getOriStage(Pair.first, Offset), 608 IssueOrder[Pair.first])); 609 } 610 } 611 612 void WindowScheduler::expand() { 613 // The MIs in the SchedResult are sorted by the issue order ID. 614 llvm::stable_sort(SchedResult, 615 [](const std::tuple<MachineInstr *, int, int, int> &A, 616 const std::tuple<MachineInstr *, int, int, int> &B) { 617 return std::get<3>(A) < std::get<3>(B); 618 }); 619 // Use the scheduling infrastructure for expansion, noting that InstrChanges 620 // is not supported here. 621 DenseMap<MachineInstr *, int> Cycles, Stages; 622 std::vector<MachineInstr *> OrderedInsts; 623 for (auto &Info : SchedResult) { 624 auto *MI = std::get<0>(Info); 625 OrderedInsts.push_back(MI); 626 Cycles[MI] = std::get<1>(Info); 627 Stages[MI] = std::get<2>(Info); 628 LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI] 629 << "]: " << *MI); 630 } 631 ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles), 632 std::move(Stages)); 633 ModuloScheduleExpander MSE(*MF, MS, *Context->LIS, 634 ModuloScheduleExpander::InstrChangesTy()); 635 MSE.expand(); 636 MSE.cleanup(); 637 } 638 639 void WindowScheduler::updateLiveIntervals() { 640 SmallVector<Register, 128> UsedRegs; 641 for (MachineInstr &MI : *MBB) 642 for (const MachineOperand &MO : MI.operands()) { 643 if (!MO.isReg() || MO.getReg() == 0) 644 continue; 645 Register Reg = MO.getReg(); 646 if (!is_contained(UsedRegs, Reg)) 647 UsedRegs.push_back(Reg); 648 } 649 Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs); 650 } 651 652 iterator_range<MachineBasicBlock::iterator> 653 WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) { 654 auto RegionBegin = MBB->begin(); 655 std::advance(RegionBegin, Offset); 656 auto RegionEnd = RegionBegin; 657 std::advance(RegionEnd, Num); 658 return make_range(RegionBegin, RegionEnd); 659 } 660 661 int WindowScheduler::getOriCycle(MachineInstr *NewMI) { 662 assert(TriToOri.count(NewMI) && "Cannot find original MI!"); 663 auto *OriMI = TriToOri[NewMI]; 664 assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!"); 665 return OriToCycle[OriMI]; 666 } 667 668 MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) { 669 assert(TriToOri.count(NewMI) && "Cannot find original MI!"); 670 return TriToOri[NewMI]; 671 } 672 673 unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) { 674 assert(llvm::find(OriMIs, OriMI) != OriMIs.end() && 675 "Cannot find OriMI in OriMIs!"); 676 // If there is no instruction fold, all MI stages are 0. 677 if (Offset == SchedPhiNum) 678 return 0; 679 // For those MIs with an ID less than the Offset, their stages are set to 0, 680 // while the rest are set to 1. 681 unsigned Id = 0; 682 for (auto *MI : OriMIs) { 683 if (MI->isMetaInstruction()) 684 continue; 685 if (MI == OriMI) 686 break; 687 ++Id; 688 } 689 return Id >= (size_t)Offset ? 1 : 0; 690 } 691 692 Register WindowScheduler::getAntiRegister(MachineInstr *Phi) { 693 assert(Phi->isPHI() && "Expecting PHI!"); 694 Register AntiReg; 695 for (auto MO : Phi->uses()) { 696 if (MO.isReg()) 697 AntiReg = MO.getReg(); 698 else if (MO.isMBB() && MO.getMBB() == MBB) 699 return AntiReg; 700 } 701 return 0; 702 } 703