1 //===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // MachineScheduler schedules machine instructions after phi elimination. It 11 // preserves LiveIntervals so it can be invoked before register allocation. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #define DEBUG_TYPE "misched" 16 17 #include "llvm/CodeGen/MachineScheduler.h" 18 #include "llvm/ADT/OwningPtr.h" 19 #include "llvm/ADT/PriorityQueue.h" 20 #include "llvm/Analysis/AliasAnalysis.h" 21 #include "llvm/CodeGen/LiveIntervalAnalysis.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/MachineLoopInfo.h" 24 #include "llvm/CodeGen/MachineRegisterInfo.h" 25 #include "llvm/CodeGen/Passes.h" 26 #include "llvm/CodeGen/RegisterClassInfo.h" 27 #include "llvm/CodeGen/ScheduleDFS.h" 28 #include "llvm/CodeGen/ScheduleHazardRecognizer.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/Debug.h" 31 #include "llvm/Support/ErrorHandling.h" 32 #include "llvm/Support/GraphWriter.h" 33 #include "llvm/Support/raw_ostream.h" 34 #include "llvm/Target/TargetInstrInfo.h" 35 #include <queue> 36 37 using namespace llvm; 38 39 namespace llvm { 40 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden, 41 cl::desc("Force top-down list scheduling")); 42 cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, 43 cl::desc("Force bottom-up list scheduling")); 44 } 45 46 #ifndef NDEBUG 47 static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, 48 cl::desc("Pop up a window to show MISched dags after they are processed")); 49 50 static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden, 51 cl::desc("Stop scheduling after N instructions"), cl::init(~0U)); 52 #else 53 static bool ViewMISchedDAGs = false; 54 #endif // NDEBUG 55 56 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden, 57 cl::desc("Enable load clustering."), cl::init(true)); 58 59 // Experimental heuristics 60 static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, 61 cl::desc("Enable scheduling for macro fusion."), cl::init(true)); 62 63 static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden, 64 cl::desc("Verify machine instrs before and after machine scheduling")); 65 66 // DAG subtrees must have at least this many nodes. 67 static const unsigned MinSubtreeSize = 8; 68 69 //===----------------------------------------------------------------------===// 70 // Machine Instruction Scheduling Pass and Registry 71 //===----------------------------------------------------------------------===// 72 73 MachineSchedContext::MachineSchedContext(): 74 MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) { 75 RegClassInfo = new RegisterClassInfo(); 76 } 77 78 MachineSchedContext::~MachineSchedContext() { 79 delete RegClassInfo; 80 } 81 82 namespace { 83 /// MachineScheduler runs after coalescing and before register allocation. 
84 class MachineScheduler : public MachineSchedContext, 85 public MachineFunctionPass { 86 public: 87 MachineScheduler(); 88 89 virtual void getAnalysisUsage(AnalysisUsage &AU) const; 90 91 virtual void releaseMemory() {} 92 93 virtual bool runOnMachineFunction(MachineFunction&); 94 95 virtual void print(raw_ostream &O, const Module* = 0) const; 96 97 static char ID; // Class identification, replacement for typeinfo 98 }; 99 } // namespace 100 101 char MachineScheduler::ID = 0; 102 103 char &llvm::MachineSchedulerID = MachineScheduler::ID; 104 105 INITIALIZE_PASS_BEGIN(MachineScheduler, "misched", 106 "Machine Instruction Scheduler", false, false) 107 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 108 INITIALIZE_PASS_DEPENDENCY(SlotIndexes) 109 INITIALIZE_PASS_DEPENDENCY(LiveIntervals) 110 INITIALIZE_PASS_END(MachineScheduler, "misched", 111 "Machine Instruction Scheduler", false, false) 112 113 MachineScheduler::MachineScheduler() 114 : MachineFunctionPass(ID) { 115 initializeMachineSchedulerPass(*PassRegistry::getPassRegistry()); 116 } 117 118 void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { 119 AU.setPreservesCFG(); 120 AU.addRequiredID(MachineDominatorsID); 121 AU.addRequired<MachineLoopInfo>(); 122 AU.addRequired<AliasAnalysis>(); 123 AU.addRequired<TargetPassConfig>(); 124 AU.addRequired<SlotIndexes>(); 125 AU.addPreserved<SlotIndexes>(); 126 AU.addRequired<LiveIntervals>(); 127 AU.addPreserved<LiveIntervals>(); 128 MachineFunctionPass::getAnalysisUsage(AU); 129 } 130 131 MachinePassRegistry MachineSchedRegistry::Registry; 132 133 /// A dummy default scheduler factory indicates whether the scheduler 134 /// is overridden on the command line. 135 static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { 136 return 0; 137 } 138 139 /// MachineSchedOpt allows command line selection of the scheduler. 140 static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, 141 RegisterPassParser<MachineSchedRegistry> > 142 MachineSchedOpt("misched", 143 cl::init(&useDefaultMachineSched), cl::Hidden, 144 cl::desc("Machine instruction scheduler to use")); 145 146 static MachineSchedRegistry 147 DefaultSchedRegistry("default", "Use the target's default scheduler choice.", 148 useDefaultMachineSched); 149 150 /// Forward declare the standard machine scheduler. This will be used as the 151 /// default scheduler if the target does not set a default. 152 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C); 153 154 155 /// Decrement this iterator until reaching the top or a non-debug instr. 156 static MachineBasicBlock::iterator 157 priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) { 158 assert(I != Beg && "reached the top of the region, cannot decrement"); 159 while (--I != Beg) { 160 if (!I->isDebugValue()) 161 break; 162 } 163 return I; 164 } 165 166 /// If this iterator is a debug value, increment until reaching the End or a 167 /// non-debug instruction. 168 static MachineBasicBlock::iterator 169 nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) { 170 for(; I != End; ++I) { 171 if (!I->isDebugValue()) 172 break; 173 } 174 return I; 175 } 176 177 /// Top-level MachineScheduler pass driver. 178 /// 179 /// Visit blocks in function order. Divide each block into scheduling regions 180 /// and visit them bottom-up. Visiting regions bottom-up is not required, but is 181 /// consistent with the DAG builder, which traverses the interior of the 182 /// scheduling regions bottom-up. 
183 /// 184 /// This design avoids exposing scheduling boundaries to the DAG builder, 185 /// simplifying the DAG builder's support for "special" target instructions. 186 /// At the same time the design allows target schedulers to operate across 187 /// scheduling boundaries, for example to bundle the boudary instructions 188 /// without reordering them. This creates complexity, because the target 189 /// scheduler must update the RegionBegin and RegionEnd positions cached by 190 /// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler 191 /// design would be to split blocks at scheduling boundaries, but LLVM has a 192 /// general bias against block splitting purely for implementation simplicity. 193 bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { 194 DEBUG(dbgs() << "Before MISsched:\n"; mf.print(dbgs())); 195 196 // Initialize the context of the pass. 197 MF = &mf; 198 MLI = &getAnalysis<MachineLoopInfo>(); 199 MDT = &getAnalysis<MachineDominatorTree>(); 200 PassConfig = &getAnalysis<TargetPassConfig>(); 201 AA = &getAnalysis<AliasAnalysis>(); 202 203 LIS = &getAnalysis<LiveIntervals>(); 204 const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); 205 206 if (VerifyScheduling) { 207 DEBUG(LIS->dump()); 208 MF->verify(this, "Before machine scheduling."); 209 } 210 RegClassInfo->runOnMachineFunction(*MF); 211 212 // Select the scheduler, or set the default. 213 MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt; 214 if (Ctor == useDefaultMachineSched) { 215 // Get the default scheduler set by the target. 216 Ctor = MachineSchedRegistry::getDefault(); 217 if (!Ctor) { 218 Ctor = createConvergingSched; 219 MachineSchedRegistry::setDefault(Ctor); 220 } 221 } 222 // Instantiate the selected scheduler. 223 OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this)); 224 225 // Visit all machine basic blocks. 226 // 227 // TODO: Visit blocks in global postorder or postorder within the bottom-up 228 // loop tree. Then we can optionally compute global RegPressure. 229 for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end(); 230 MBB != MBBEnd; ++MBB) { 231 232 Scheduler->startBlock(MBB); 233 234 // Break the block into scheduling regions [I, RegionEnd), and schedule each 235 // region as soon as it is discovered. RegionEnd points the scheduling 236 // boundary at the bottom of the region. The DAG does not include RegionEnd, 237 // but the region does (i.e. the next RegionEnd is above the previous 238 // RegionBegin). If the current block has no terminator then RegionEnd == 239 // MBB->end() for the bottom region. 240 // 241 // The Scheduler may insert instructions during either schedule() or 242 // exitRegion(), even for empty regions. So the local iterators 'I' and 243 // 'RegionEnd' are invalid across these calls. 244 unsigned RemainingInstrs = MBB->size(); 245 for(MachineBasicBlock::iterator RegionEnd = MBB->end(); 246 RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) { 247 248 // Avoid decrementing RegionEnd for blocks with no terminator. 249 if (RegionEnd != MBB->end() 250 || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) { 251 --RegionEnd; 252 // Count the boundary instruction. 253 --RemainingInstrs; 254 } 255 256 // The next region starts above the previous region. Look backward in the 257 // instruction stream until we find the nearest boundary. 
258 MachineBasicBlock::iterator I = RegionEnd; 259 for(;I != MBB->begin(); --I, --RemainingInstrs) { 260 if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF)) 261 break; 262 } 263 // Notify the scheduler of the region, even if we may skip scheduling 264 // it. Perhaps it still needs to be bundled. 265 Scheduler->enterRegion(MBB, I, RegionEnd, RemainingInstrs); 266 267 // Skip empty scheduling regions (0 or 1 schedulable instructions). 268 if (I == RegionEnd || I == llvm::prior(RegionEnd)) { 269 // Close the current region. Bundle the terminator if needed. 270 // This invalidates 'RegionEnd' and 'I'. 271 Scheduler->exitRegion(); 272 continue; 273 } 274 DEBUG(dbgs() << "********** MI Scheduling **********\n"); 275 DEBUG(dbgs() << MF->getName() 276 << ":BB#" << MBB->getNumber() << " " << MBB->getName() 277 << "\n From: " << *I << " To: "; 278 if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; 279 else dbgs() << "End"; 280 dbgs() << " Remaining: " << RemainingInstrs << "\n"); 281 282 // Schedule a region: possibly reorder instructions. 283 // This invalidates 'RegionEnd' and 'I'. 284 Scheduler->schedule(); 285 286 // Close the current region. 287 Scheduler->exitRegion(); 288 289 // Scheduling has invalidated the current iterator 'I'. Ask the 290 // scheduler for the top of it's scheduled region. 291 RegionEnd = Scheduler->begin(); 292 } 293 assert(RemainingInstrs == 0 && "Instruction count mismatch!"); 294 Scheduler->finishBlock(); 295 } 296 Scheduler->finalizeSchedule(); 297 DEBUG(LIS->dump()); 298 if (VerifyScheduling) 299 MF->verify(this, "After machine scheduling."); 300 return true; 301 } 302 303 void MachineScheduler::print(raw_ostream &O, const Module* m) const { 304 // unimplemented 305 } 306 307 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 308 void ReadyQueue::dump() { 309 dbgs() << Name << ": "; 310 for (unsigned i = 0, e = Queue.size(); i < e; ++i) 311 dbgs() << Queue[i]->NodeNum << " "; 312 dbgs() << "\n"; 313 } 314 #endif 315 316 //===----------------------------------------------------------------------===// 317 // ScheduleDAGMI - Base class for MachineInstr scheduling with LiveIntervals 318 // preservation. 319 //===----------------------------------------------------------------------===// 320 321 ScheduleDAGMI::~ScheduleDAGMI() { 322 delete DFSResult; 323 DeleteContainerPointers(Mutations); 324 delete SchedImpl; 325 } 326 327 bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) { 328 return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU); 329 } 330 331 bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) { 332 if (SuccSU != &ExitSU) { 333 // Do not use WillCreateCycle, it assumes SD scheduling. 334 // If Pred is reachable from Succ, then the edge creates a cycle. 335 if (Topo.IsReachable(PredDep.getSUnit(), SuccSU)) 336 return false; 337 Topo.AddPred(SuccSU, PredDep.getSUnit()); 338 } 339 SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial()); 340 // Return true regardless of whether a new edge needed to be inserted. 341 return true; 342 } 343 344 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When 345 /// NumPredsLeft reaches zero, release the successor node. 346 /// 347 /// FIXME: Adjust SuccSU height based on MinLatency. 
348 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { 349 SUnit *SuccSU = SuccEdge->getSUnit(); 350 351 if (SuccEdge->isWeak()) { 352 --SuccSU->WeakPredsLeft; 353 if (SuccEdge->isCluster()) 354 NextClusterSucc = SuccSU; 355 return; 356 } 357 #ifndef NDEBUG 358 if (SuccSU->NumPredsLeft == 0) { 359 dbgs() << "*** Scheduling failed! ***\n"; 360 SuccSU->dump(this); 361 dbgs() << " has been released too many times!\n"; 362 llvm_unreachable(0); 363 } 364 #endif 365 --SuccSU->NumPredsLeft; 366 if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) 367 SchedImpl->releaseTopNode(SuccSU); 368 } 369 370 /// releaseSuccessors - Call releaseSucc on each of SU's successors. 371 void ScheduleDAGMI::releaseSuccessors(SUnit *SU) { 372 for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); 373 I != E; ++I) { 374 releaseSucc(SU, &*I); 375 } 376 } 377 378 /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When 379 /// NumSuccsLeft reaches zero, release the predecessor node. 380 /// 381 /// FIXME: Adjust PredSU height based on MinLatency. 382 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { 383 SUnit *PredSU = PredEdge->getSUnit(); 384 385 if (PredEdge->isWeak()) { 386 --PredSU->WeakSuccsLeft; 387 if (PredEdge->isCluster()) 388 NextClusterPred = PredSU; 389 return; 390 } 391 #ifndef NDEBUG 392 if (PredSU->NumSuccsLeft == 0) { 393 dbgs() << "*** Scheduling failed! ***\n"; 394 PredSU->dump(this); 395 dbgs() << " has been released too many times!\n"; 396 llvm_unreachable(0); 397 } 398 #endif 399 --PredSU->NumSuccsLeft; 400 if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) 401 SchedImpl->releaseBottomNode(PredSU); 402 } 403 404 /// releasePredecessors - Call releasePred on each of SU's predecessors. 405 void ScheduleDAGMI::releasePredecessors(SUnit *SU) { 406 for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); 407 I != E; ++I) { 408 releasePred(SU, &*I); 409 } 410 } 411 412 /// This is normally called from the main scheduler loop but may also be invoked 413 /// by the scheduling strategy to perform additional code motion. 414 void ScheduleDAGMI::moveInstruction(MachineInstr *MI, 415 MachineBasicBlock::iterator InsertPos) { 416 // Advance RegionBegin if the first instruction moves down. 417 if (&*RegionBegin == MI) 418 ++RegionBegin; 419 420 // Update the instruction stream. 421 BB->splice(InsertPos, BB, MI); 422 423 // Update LiveIntervals 424 LIS->handleMove(MI, /*UpdateFlags=*/true); 425 426 // Recede RegionBegin if an instruction moves above the first. 427 if (RegionBegin == InsertPos) 428 RegionBegin = MI; 429 } 430 431 bool ScheduleDAGMI::checkSchedLimit() { 432 #ifndef NDEBUG 433 if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) { 434 CurrentTop = CurrentBottom; 435 return false; 436 } 437 ++NumInstrsScheduled; 438 #endif 439 return true; 440 } 441 442 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after 443 /// crossing a scheduling boundary. [begin, end) includes all instructions in 444 /// the region, including the boundary itself and single-instruction regions 445 /// that don't get scheduled. 446 void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb, 447 MachineBasicBlock::iterator begin, 448 MachineBasicBlock::iterator end, 449 unsigned endcount) 450 { 451 ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount); 452 453 // For convenience remember the end of the liveness region. 454 LiveRegionEnd = 455 (RegionEnd == bb->end()) ? 
RegionEnd : llvm::next(RegionEnd); 456 } 457 458 // Setup the register pressure trackers for the top scheduled top and bottom 459 // scheduled regions. 460 void ScheduleDAGMI::initRegPressure() { 461 TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); 462 BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd); 463 464 // Close the RPTracker to finalize live ins. 465 RPTracker.closeRegion(); 466 467 DEBUG(RPTracker.getPressure().dump(TRI)); 468 469 // Initialize the live ins and live outs. 470 TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); 471 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); 472 473 // Close one end of the tracker so we can call 474 // getMaxUpward/DownwardPressureDelta before advancing across any 475 // instructions. This converts currently live regs into live ins/outs. 476 TopRPTracker.closeTop(); 477 BotRPTracker.closeBottom(); 478 479 // Account for liveness generated by the region boundary. 480 if (LiveRegionEnd != RegionEnd) 481 BotRPTracker.recede(); 482 483 assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom"); 484 485 // Cache the list of excess pressure sets in this region. This will also track 486 // the max pressure in the scheduled code for these sets. 487 RegionCriticalPSets.clear(); 488 const std::vector<unsigned> &RegionPressure = 489 RPTracker.getPressure().MaxSetPressure; 490 for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) { 491 unsigned Limit = RegClassInfo->getRegPressureSetLimit(i); 492 if (RegionPressure[i] > Limit) { 493 DEBUG(dbgs() << TRI->getRegPressureSetName(i) 494 << " Limit " << Limit 495 << " Actual " << RegionPressure[i] << "\n"); 496 RegionCriticalPSets.push_back(PressureElement(i, 0)); 497 } 498 } 499 DEBUG(dbgs() << "Excess PSets: "; 500 for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i) 501 dbgs() << TRI->getRegPressureSetName( 502 RegionCriticalPSets[i].PSetID) << " "; 503 dbgs() << "\n"); 504 } 505 506 // FIXME: When the pressure tracker deals in pressure differences then we won't 507 // iterate over all RegionCriticalPSets[i]. 508 void ScheduleDAGMI:: 509 updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure) { 510 for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) { 511 unsigned ID = RegionCriticalPSets[i].PSetID; 512 int &MaxUnits = RegionCriticalPSets[i].UnitIncrease; 513 if ((int)NewMaxPressure[ID] > MaxUnits) 514 MaxUnits = NewMaxPressure[ID]; 515 } 516 DEBUG( 517 for (unsigned i = 0, e = NewMaxPressure.size(); i < e; ++i) { 518 unsigned Limit = RegClassInfo->getRegPressureSetLimit(i); 519 if (NewMaxPressure[i] > Limit ) { 520 dbgs() << " " << TRI->getRegPressureSetName(i) << ": " 521 << NewMaxPressure[i] << " > " << Limit << "\n"; 522 } 523 }); 524 } 525 526 /// schedule - Called back from MachineScheduler::runOnMachineFunction 527 /// after setting up the current scheduling region. [RegionBegin, RegionEnd) 528 /// only includes instructions that have DAG nodes, not scheduling boundaries. 529 /// 530 /// This is a skeletal driver, with all the functionality pushed into helpers, 531 /// so that it can be easilly extended by experimental schedulers. Generally, 532 /// implementing MachineSchedStrategy should be sufficient to implement a new 533 /// scheduling algorithm. However, if a scheduler further subclasses 534 /// ScheduleDAGMI then it will want to override this virtual method in order to 535 /// update any specialized state. 
536 void ScheduleDAGMI::schedule() { 537 buildDAGWithRegPressure(); 538 539 Topo.InitDAGTopologicalSorting(); 540 541 postprocessDAG(); 542 543 SmallVector<SUnit*, 8> TopRoots, BotRoots; 544 findRootsAndBiasEdges(TopRoots, BotRoots); 545 546 // Initialize the strategy before modifying the DAG. 547 // This may initialize a DFSResult to be used for queue priority. 548 SchedImpl->initialize(this); 549 550 DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) 551 SUnits[su].dumpAll(this)); 552 if (ViewMISchedDAGs) viewGraph(); 553 554 // Initialize ready queues now that the DAG and priority data are finalized. 555 initQueues(TopRoots, BotRoots); 556 557 bool IsTopNode = false; 558 while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { 559 assert(!SU->isScheduled && "Node already scheduled"); 560 if (!checkSchedLimit()) 561 break; 562 563 scheduleMI(SU, IsTopNode); 564 565 updateQueues(SU, IsTopNode); 566 } 567 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); 568 569 placeDebugValues(); 570 571 DEBUG({ 572 unsigned BBNum = begin()->getParent()->getNumber(); 573 dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; 574 dumpSchedule(); 575 dbgs() << '\n'; 576 }); 577 } 578 579 /// Build the DAG and setup three register pressure trackers. 580 void ScheduleDAGMI::buildDAGWithRegPressure() { 581 // Initialize the register pressure tracker used by buildSchedGraph. 582 RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd); 583 584 // Account for liveness generate by the region boundary. 585 if (LiveRegionEnd != RegionEnd) 586 RPTracker.recede(); 587 588 // Build the DAG, and compute current register pressure. 589 buildSchedGraph(AA, &RPTracker); 590 591 // Initialize top/bottom trackers after computing region pressure. 592 initRegPressure(); 593 } 594 595 /// Apply each ScheduleDAGMutation step in order. 596 void ScheduleDAGMI::postprocessDAG() { 597 for (unsigned i = 0, e = Mutations.size(); i < e; ++i) { 598 Mutations[i]->apply(this); 599 } 600 } 601 602 void ScheduleDAGMI::computeDFSResult() { 603 if (!DFSResult) 604 DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize); 605 DFSResult->clear(); 606 ScheduledTrees.clear(); 607 DFSResult->resize(SUnits.size()); 608 DFSResult->compute(SUnits); 609 ScheduledTrees.resize(DFSResult->getNumSubtrees()); 610 } 611 612 void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots, 613 SmallVectorImpl<SUnit*> &BotRoots) { 614 for (std::vector<SUnit>::iterator 615 I = SUnits.begin(), E = SUnits.end(); I != E; ++I) { 616 SUnit *SU = &(*I); 617 assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits"); 618 619 // Order predecessors so DFSResult follows the critical path. 620 SU->biasCriticalPath(); 621 622 // A SUnit is ready to top schedule if it has no predecessors. 623 if (!I->NumPredsLeft) 624 TopRoots.push_back(SU); 625 // A SUnit is ready to bottom schedule if it has no successors. 626 if (!I->NumSuccsLeft) 627 BotRoots.push_back(SU); 628 } 629 ExitSU.biasCriticalPath(); 630 } 631 632 /// Identify DAG roots and setup scheduler queues. 633 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots, 634 ArrayRef<SUnit*> BotRoots) { 635 NextClusterSucc = NULL; 636 NextClusterPred = NULL; 637 638 // Release all DAG roots for scheduling, not including EntrySU/ExitSU. 639 // 640 // Nodes with unreleased weak edges can still be roots. 641 // Release top roots in forward order. 
642 for (SmallVectorImpl<SUnit*>::const_iterator 643 I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) { 644 SchedImpl->releaseTopNode(*I); 645 } 646 // Release bottom roots in reverse order so the higher priority nodes appear 647 // first. This is more natural and slightly more efficient. 648 for (SmallVectorImpl<SUnit*>::const_reverse_iterator 649 I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) { 650 SchedImpl->releaseBottomNode(*I); 651 } 652 653 releaseSuccessors(&EntrySU); 654 releasePredecessors(&ExitSU); 655 656 SchedImpl->registerRoots(); 657 658 // Advance past initial DebugValues. 659 assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); 660 CurrentTop = nextIfDebug(RegionBegin, RegionEnd); 661 TopRPTracker.setPos(CurrentTop); 662 663 CurrentBottom = RegionEnd; 664 } 665 666 /// Move an instruction and update register pressure. 667 void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) { 668 // Move the instruction to its new location in the instruction stream. 669 MachineInstr *MI = SU->getInstr(); 670 671 if (IsTopNode) { 672 assert(SU->isTopReady() && "node still has unscheduled dependencies"); 673 if (&*CurrentTop == MI) 674 CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom); 675 else { 676 moveInstruction(MI, CurrentTop); 677 TopRPTracker.setPos(MI); 678 } 679 680 // Update top scheduled pressure. 681 TopRPTracker.advance(); 682 assert(TopRPTracker.getPos() == CurrentTop && "out of sync"); 683 updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure); 684 } 685 else { 686 assert(SU->isBottomReady() && "node still has unscheduled dependencies"); 687 MachineBasicBlock::iterator priorII = 688 priorNonDebug(CurrentBottom, CurrentTop); 689 if (&*priorII == MI) 690 CurrentBottom = priorII; 691 else { 692 if (&*CurrentTop == MI) { 693 CurrentTop = nextIfDebug(++CurrentTop, priorII); 694 TopRPTracker.setPos(CurrentTop); 695 } 696 moveInstruction(MI, CurrentBottom); 697 CurrentBottom = MI; 698 } 699 // Update bottom scheduled pressure. 700 BotRPTracker.recede(); 701 assert(BotRPTracker.getPos() == CurrentBottom && "out of sync"); 702 updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure); 703 } 704 } 705 706 /// Update scheduler queues after scheduling an instruction. 707 void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) { 708 // Release dependent instructions for scheduling. 709 if (IsTopNode) 710 releaseSuccessors(SU); 711 else 712 releasePredecessors(SU); 713 714 SU->isScheduled = true; 715 716 if (DFSResult) { 717 unsigned SubtreeID = DFSResult->getSubtreeID(SU); 718 if (!ScheduledTrees.test(SubtreeID)) { 719 ScheduledTrees.set(SubtreeID); 720 DFSResult->scheduleTree(SubtreeID); 721 SchedImpl->scheduleTree(SubtreeID); 722 } 723 } 724 725 // Notify the scheduling strategy after updating the DAG. 726 SchedImpl->schedNode(SU, IsTopNode); 727 } 728 729 /// Reinsert any remaining debug_values, just like the PostRA scheduler. 730 void ScheduleDAGMI::placeDebugValues() { 731 // If first instruction was a DBG_VALUE then put it back. 
732 if (FirstDbgValue) { 733 BB->splice(RegionBegin, BB, FirstDbgValue); 734 RegionBegin = FirstDbgValue; 735 } 736 737 for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator 738 DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { 739 std::pair<MachineInstr *, MachineInstr *> P = *prior(DI); 740 MachineInstr *DbgValue = P.first; 741 MachineBasicBlock::iterator OrigPrevMI = P.second; 742 if (&*RegionBegin == DbgValue) 743 ++RegionBegin; 744 BB->splice(++OrigPrevMI, BB, DbgValue); 745 if (OrigPrevMI == llvm::prior(RegionEnd)) 746 RegionEnd = DbgValue; 747 } 748 DbgValues.clear(); 749 FirstDbgValue = NULL; 750 } 751 752 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 753 void ScheduleDAGMI::dumpSchedule() const { 754 for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) { 755 if (SUnit *SU = getSUnit(&(*MI))) 756 SU->dump(this); 757 else 758 dbgs() << "Missing SUnit\n"; 759 } 760 } 761 #endif 762 763 //===----------------------------------------------------------------------===// 764 // LoadClusterMutation - DAG post-processing to cluster loads. 765 //===----------------------------------------------------------------------===// 766 767 namespace { 768 /// \brief Post-process the DAG to create cluster edges between neighboring 769 /// loads. 770 class LoadClusterMutation : public ScheduleDAGMutation { 771 struct LoadInfo { 772 SUnit *SU; 773 unsigned BaseReg; 774 unsigned Offset; 775 LoadInfo(SUnit *su, unsigned reg, unsigned ofs) 776 : SU(su), BaseReg(reg), Offset(ofs) {} 777 }; 778 static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS, 779 const LoadClusterMutation::LoadInfo &RHS); 780 781 const TargetInstrInfo *TII; 782 const TargetRegisterInfo *TRI; 783 public: 784 LoadClusterMutation(const TargetInstrInfo *tii, 785 const TargetRegisterInfo *tri) 786 : TII(tii), TRI(tri) {} 787 788 virtual void apply(ScheduleDAGMI *DAG); 789 protected: 790 void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG); 791 }; 792 } // anonymous 793 794 bool LoadClusterMutation::LoadInfoLess( 795 const LoadClusterMutation::LoadInfo &LHS, 796 const LoadClusterMutation::LoadInfo &RHS) { 797 if (LHS.BaseReg != RHS.BaseReg) 798 return LHS.BaseReg < RHS.BaseReg; 799 return LHS.Offset < RHS.Offset; 800 } 801 802 void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads, 803 ScheduleDAGMI *DAG) { 804 SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords; 805 for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) { 806 SUnit *SU = Loads[Idx]; 807 unsigned BaseReg; 808 unsigned Offset; 809 if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI)) 810 LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset)); 811 } 812 if (LoadRecords.size() < 2) 813 return; 814 std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess); 815 unsigned ClusterLength = 1; 816 for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) { 817 if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) { 818 ClusterLength = 1; 819 continue; 820 } 821 822 SUnit *SUa = LoadRecords[Idx].SU; 823 SUnit *SUb = LoadRecords[Idx+1].SU; 824 if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength) 825 && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { 826 827 DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU(" 828 << SUb->NodeNum << ")\n"); 829 // Copy successor edges from SUa to SUb. Interleaving computation 830 // dependent on SUa can prevent load combining due to register reuse. 
831 // Predecessor edges do not need to be copied from SUb to SUa since nearby 832 // loads should have effectively the same inputs. 833 for (SUnit::const_succ_iterator 834 SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) { 835 if (SI->getSUnit() == SUb) 836 continue; 837 DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n"); 838 DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial)); 839 } 840 ++ClusterLength; 841 } 842 else 843 ClusterLength = 1; 844 } 845 } 846 847 /// \brief Callback from DAG postProcessing to create cluster edges for loads. 848 void LoadClusterMutation::apply(ScheduleDAGMI *DAG) { 849 // Map DAG NodeNum to store chain ID. 850 DenseMap<unsigned, unsigned> StoreChainIDs; 851 // Map each store chain to a set of dependent loads. 852 SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents; 853 for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { 854 SUnit *SU = &DAG->SUnits[Idx]; 855 if (!SU->getInstr()->mayLoad()) 856 continue; 857 unsigned ChainPredID = DAG->SUnits.size(); 858 for (SUnit::const_pred_iterator 859 PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) { 860 if (PI->isCtrl()) { 861 ChainPredID = PI->getSUnit()->NodeNum; 862 break; 863 } 864 } 865 // Check if this chain-like pred has been seen 866 // before. ChainPredID==MaxNodeID for loads at the top of the schedule. 867 unsigned NumChains = StoreChainDependents.size(); 868 std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result = 869 StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); 870 if (Result.second) 871 StoreChainDependents.resize(NumChains + 1); 872 StoreChainDependents[Result.first->second].push_back(SU); 873 } 874 // Iterate over the store chains. 875 for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx) 876 clusterNeighboringLoads(StoreChainDependents[Idx], DAG); 877 } 878 879 //===----------------------------------------------------------------------===// 880 // MacroFusion - DAG post-processing to encourage fusion of macro ops. 881 //===----------------------------------------------------------------------===// 882 883 namespace { 884 /// \brief Post-process the DAG to create cluster edges between instructions 885 /// that may be fused by the processor into a single operation. 886 class MacroFusion : public ScheduleDAGMutation { 887 const TargetInstrInfo *TII; 888 public: 889 MacroFusion(const TargetInstrInfo *tii): TII(tii) {} 890 891 virtual void apply(ScheduleDAGMI *DAG); 892 }; 893 } // anonymous 894 895 /// \brief Callback from DAG postProcessing to create cluster edges to encourage 896 /// fused operations. 897 void MacroFusion::apply(ScheduleDAGMI *DAG) { 898 // For now, assume targets can only fuse with the branch. 899 MachineInstr *Branch = DAG->ExitSU.getInstr(); 900 if (!Branch) 901 return; 902 903 for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) { 904 SUnit *SU = &DAG->SUnits[--Idx]; 905 if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch)) 906 continue; 907 908 // Create a single weak edge from SU to ExitSU. The only effect is to cause 909 // bottom-up scheduling to heavily prioritize the clustered SU. There is no 910 // need to copy predecessor edges from ExitSU to SU, since top-down 911 // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling 912 // of SU, we could create an artificial edge from the deepest root, but it 913 // hasn't been needed yet. 
914 bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster)); 915 (void)Success; 916 assert(Success && "No DAG nodes should be reachable from ExitSU"); 917 918 DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n"); 919 break; 920 } 921 } 922 923 //===----------------------------------------------------------------------===// 924 // CopyConstrain - DAG post-processing to encourage copy elimination. 925 //===----------------------------------------------------------------------===// 926 927 namespace { 928 /// \brief Post-process the DAG to create weak edges from all uses of a copy to 929 /// the one use that defines the copy's source vreg, most likely an induction 930 /// variable increment. 931 class CopyConstrain : public ScheduleDAGMutation { 932 // Transient state. 933 SlotIndex RegionBeginIdx; 934 // RegionEndIdx is the slot index of the last non-debug instruction in the 935 // scheduling region. So we may have RegionBeginIdx == RegionEndIdx. 936 SlotIndex RegionEndIdx; 937 public: 938 CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} 939 940 virtual void apply(ScheduleDAGMI *DAG); 941 942 protected: 943 void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG); 944 }; 945 } // anonymous 946 947 /// constrainLocalCopy handles two possibilities: 948 /// 1) Local src: 949 /// I0: = dst 950 /// I1: src = ... 951 /// I2: = dst 952 /// I3: dst = src (copy) 953 /// (create pred->succ edges I0->I1, I2->I1) 954 /// 955 /// 2) Local copy: 956 /// I0: dst = src (copy) 957 /// I1: = dst 958 /// I2: src = ... 959 /// I3: = dst 960 /// (create pred->succ edges I1->I2, I3->I2) 961 /// 962 /// Although the MachineScheduler is currently constrained to single blocks, 963 /// this algorithm should handle extended blocks. An EBB is a set of 964 /// contiguously numbered blocks such that the previous block in the EBB is 965 /// always the single predecessor. 966 void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) { 967 LiveIntervals *LIS = DAG->getLIS(); 968 MachineInstr *Copy = CopySU->getInstr(); 969 970 // Check for pure vreg copies. 971 unsigned SrcReg = Copy->getOperand(1).getReg(); 972 if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) 973 return; 974 975 unsigned DstReg = Copy->getOperand(0).getReg(); 976 if (!TargetRegisterInfo::isVirtualRegister(DstReg)) 977 return; 978 979 // Check if either the dest or source is local. If it's live across a back 980 // edge, it's not local. Note that if both vregs are live across the back 981 // edge, we cannot successfully contrain the copy without cyclic scheduling. 982 unsigned LocalReg = DstReg; 983 unsigned GlobalReg = SrcReg; 984 LiveInterval *LocalLI = &LIS->getInterval(LocalReg); 985 if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) { 986 LocalReg = SrcReg; 987 GlobalReg = DstReg; 988 LocalLI = &LIS->getInterval(LocalReg); 989 if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) 990 return; 991 } 992 LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg); 993 994 // Find the global segment after the start of the local LI. 995 LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex()); 996 // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a 997 // local live range. We could create edges from other global uses to the local 998 // start, but the coalescer should have already eliminated these cases, so 999 // don't bother dealing with it. 
1000 if (GlobalSegment == GlobalLI->end()) 1001 return; 1002 1003 // If GlobalSegment is killed at the LocalLI->start, the call to find() 1004 // returned the next global segment. But if GlobalSegment overlaps with 1005 // LocalLI->start, then advance to the next segement. If a hole in GlobalLI 1006 // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole. 1007 if (GlobalSegment->contains(LocalLI->beginIndex())) 1008 ++GlobalSegment; 1009 1010 if (GlobalSegment == GlobalLI->end()) 1011 return; 1012 1013 // Check if GlobalLI contains a hole in the vicinity of LocalLI. 1014 if (GlobalSegment != GlobalLI->begin()) { 1015 // Two address defs have no hole. 1016 if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end, 1017 GlobalSegment->start)) { 1018 return; 1019 } 1020 // If the prior global segment may be defined by the same two-address 1021 // instruction that also defines LocalLI, then can't make a hole here. 1022 if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->start, 1023 LocalLI->beginIndex())) { 1024 return; 1025 } 1026 // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise 1027 // it would be a disconnected component in the live range. 1028 assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() && 1029 "Disconnected LRG within the scheduling region."); 1030 } 1031 MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start); 1032 if (!GlobalDef) 1033 return; 1034 1035 SUnit *GlobalSU = DAG->getSUnit(GlobalDef); 1036 if (!GlobalSU) 1037 return; 1038 1039 // GlobalDef is the bottom of the GlobalLI hole. Open the hole by 1040 // constraining the uses of the last local def to precede GlobalDef. 1041 SmallVector<SUnit*,8> LocalUses; 1042 const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex()); 1043 MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def); 1044 SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef); 1045 for (SUnit::const_succ_iterator 1046 I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end(); 1047 I != E; ++I) { 1048 if (I->getKind() != SDep::Data || I->getReg() != LocalReg) 1049 continue; 1050 if (I->getSUnit() == GlobalSU) 1051 continue; 1052 if (!DAG->canAddEdge(GlobalSU, I->getSUnit())) 1053 return; 1054 LocalUses.push_back(I->getSUnit()); 1055 } 1056 // Open the top of the GlobalLI hole by constraining any earlier global uses 1057 // to precede the start of LocalLI. 1058 SmallVector<SUnit*,8> GlobalUses; 1059 MachineInstr *FirstLocalDef = 1060 LIS->getInstructionFromIndex(LocalLI->beginIndex()); 1061 SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef); 1062 for (SUnit::const_pred_iterator 1063 I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) { 1064 if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg) 1065 continue; 1066 if (I->getSUnit() == FirstLocalSU) 1067 continue; 1068 if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit())) 1069 return; 1070 GlobalUses.push_back(I->getSUnit()); 1071 } 1072 DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n"); 1073 // Add the weak edges. 
1074 for (SmallVectorImpl<SUnit*>::const_iterator 1075 I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) { 1076 DEBUG(dbgs() << " Local use SU(" << (*I)->NodeNum << ") -> SU(" 1077 << GlobalSU->NodeNum << ")\n"); 1078 DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak)); 1079 } 1080 for (SmallVectorImpl<SUnit*>::const_iterator 1081 I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) { 1082 DEBUG(dbgs() << " Global use SU(" << (*I)->NodeNum << ") -> SU(" 1083 << FirstLocalSU->NodeNum << ")\n"); 1084 DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak)); 1085 } 1086 } 1087 1088 /// \brief Callback from DAG postProcessing to create weak edges to encourage 1089 /// copy elimination. 1090 void CopyConstrain::apply(ScheduleDAGMI *DAG) { 1091 MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end()); 1092 if (FirstPos == DAG->end()) 1093 return; 1094 RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos); 1095 RegionEndIdx = DAG->getLIS()->getInstructionIndex( 1096 &*priorNonDebug(DAG->end(), DAG->begin())); 1097 1098 for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { 1099 SUnit *SU = &DAG->SUnits[Idx]; 1100 if (!SU->getInstr()->isCopy()) 1101 continue; 1102 1103 constrainLocalCopy(SU, DAG); 1104 } 1105 } 1106 1107 //===----------------------------------------------------------------------===// 1108 // ConvergingScheduler - Implementation of the generic MachineSchedStrategy. 1109 //===----------------------------------------------------------------------===// 1110 1111 namespace { 1112 /// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance 1113 /// the schedule. 1114 class ConvergingScheduler : public MachineSchedStrategy { 1115 public: 1116 /// Represent the type of SchedCandidate found within a single queue. 1117 /// pickNodeBidirectional depends on these listed by decreasing priority. 1118 enum CandReason { 1119 NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax, 1120 ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, 1121 TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; 1122 1123 #ifndef NDEBUG 1124 static const char *getReasonStr(ConvergingScheduler::CandReason Reason); 1125 #endif 1126 1127 /// Policy for scheduling the next instruction in the candidate's zone. 1128 struct CandPolicy { 1129 bool ReduceLatency; 1130 unsigned ReduceResIdx; 1131 unsigned DemandResIdx; 1132 1133 CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {} 1134 }; 1135 1136 /// Status of an instruction's critical resource consumption. 1137 struct SchedResourceDelta { 1138 // Count critical resources in the scheduled region required by SU. 1139 unsigned CritResources; 1140 1141 // Count critical resources from another region consumed by SU. 1142 unsigned DemandedResources; 1143 1144 SchedResourceDelta(): CritResources(0), DemandedResources(0) {} 1145 1146 bool operator==(const SchedResourceDelta &RHS) const { 1147 return CritResources == RHS.CritResources 1148 && DemandedResources == RHS.DemandedResources; 1149 } 1150 bool operator!=(const SchedResourceDelta &RHS) const { 1151 return !operator==(RHS); 1152 } 1153 }; 1154 1155 /// Store the state used by ConvergingScheduler heuristics, required for the 1156 /// lifetime of one invocation of pickNode(). 1157 struct SchedCandidate { 1158 CandPolicy Policy; 1159 1160 // The best SUnit candidate. 1161 SUnit *SU; 1162 1163 // The reason for this candidate. 1164 CandReason Reason; 1165 1166 // Set of reasons that apply to multiple candidates. 
1167 uint32_t RepeatReasonSet; 1168 1169 // Register pressure values for the best candidate. 1170 RegPressureDelta RPDelta; 1171 1172 // Critical resource consumption of the best candidate. 1173 SchedResourceDelta ResDelta; 1174 1175 SchedCandidate(const CandPolicy &policy) 1176 : Policy(policy), SU(NULL), Reason(NoCand), RepeatReasonSet(0) {} 1177 1178 bool isValid() const { return SU; } 1179 1180 // Copy the status of another candidate without changing policy. 1181 void setBest(SchedCandidate &Best) { 1182 assert(Best.Reason != NoCand && "uninitialized Sched candidate"); 1183 SU = Best.SU; 1184 Reason = Best.Reason; 1185 RPDelta = Best.RPDelta; 1186 ResDelta = Best.ResDelta; 1187 } 1188 1189 bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); } 1190 void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); } 1191 1192 void initResourceDelta(const ScheduleDAGMI *DAG, 1193 const TargetSchedModel *SchedModel); 1194 }; 1195 1196 /// Summarize the unscheduled region. 1197 struct SchedRemainder { 1198 // Critical path through the DAG in expected latency. 1199 unsigned CriticalPath; 1200 1201 // Scaled count of micro-ops left to schedule. 1202 unsigned RemIssueCount; 1203 1204 // Unscheduled resources 1205 SmallVector<unsigned, 16> RemainingCounts; 1206 1207 void reset() { 1208 CriticalPath = 0; 1209 RemIssueCount = 0; 1210 RemainingCounts.clear(); 1211 } 1212 1213 SchedRemainder() { reset(); } 1214 1215 void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel); 1216 }; 1217 1218 /// Each Scheduling boundary is associated with ready queues. It tracks the 1219 /// current cycle in the direction of movement, and maintains the state 1220 /// of "hazards" and other interlocks at the current cycle. 1221 struct SchedBoundary { 1222 ScheduleDAGMI *DAG; 1223 const TargetSchedModel *SchedModel; 1224 SchedRemainder *Rem; 1225 1226 ReadyQueue Available; 1227 ReadyQueue Pending; 1228 bool CheckPending; 1229 1230 // For heuristics, keep a list of the nodes that immediately depend on the 1231 // most recently scheduled node. 1232 SmallPtrSet<const SUnit*, 8> NextSUs; 1233 1234 ScheduleHazardRecognizer *HazardRec; 1235 1236 /// Number of cycles it takes to issue the instructions scheduled in this 1237 /// zone. It is defined as: scheduled-micro-ops / issue-width + stalls. 1238 /// See getStalls(). 1239 unsigned CurrCycle; 1240 1241 /// Micro-ops issued in the current cycle 1242 unsigned CurrMOps; 1243 1244 /// MinReadyCycle - Cycle of the soonest available instruction. 1245 unsigned MinReadyCycle; 1246 1247 // The expected latency of the critical path in this scheduled zone. 1248 unsigned ExpectedLatency; 1249 1250 // The latency of dependence chains leading into this zone. 1251 // For each node scheduled top-down: DLat = max DLat, N.Depth. 1252 // For each cycle scheduled: DLat -= 1. 1253 unsigned DependentLatency; 1254 1255 /// Count the scheduled (issued) micro-ops that can be retired by 1256 /// time=CurrCycle assuming the first scheduled instr is retired at time=0. 1257 unsigned RetiredMOps; 1258 1259 // Count scheduled resources that have been executed. Resources are 1260 // considered executed if they become ready in the time that it takes to 1261 // saturate any resource including the one in question. Counts are scaled 1262 // for direct comparison with other resources. Counts can be compared with 1263 // MOps * getMicroOpFactor and Latency * getLatencyFactor. 1264 SmallVector<unsigned, 16> ExecutedResCounts; 1265 1266 /// Cache the max count for a single resource. 
1267 unsigned MaxExecutedResCount; 1268 1269 // Cache the critical resources ID in this scheduled zone. 1270 unsigned ZoneCritResIdx; 1271 1272 // Is the scheduled region resource limited vs. latency limited. 1273 bool IsResourceLimited; 1274 1275 #ifndef NDEBUG 1276 // Remember the greatest operand latency as an upper bound on the number of 1277 // times we should retry the pending queue because of a hazard. 1278 unsigned MaxObservedLatency; 1279 #endif 1280 1281 void reset() { 1282 // A new HazardRec is created for each DAG and owned by SchedBoundary. 1283 delete HazardRec; 1284 1285 Available.clear(); 1286 Pending.clear(); 1287 CheckPending = false; 1288 NextSUs.clear(); 1289 HazardRec = 0; 1290 CurrCycle = 0; 1291 CurrMOps = 0; 1292 MinReadyCycle = UINT_MAX; 1293 ExpectedLatency = 0; 1294 DependentLatency = 0; 1295 RetiredMOps = 0; 1296 MaxExecutedResCount = 0; 1297 ZoneCritResIdx = 0; 1298 IsResourceLimited = false; 1299 #ifndef NDEBUG 1300 MaxObservedLatency = 0; 1301 #endif 1302 // Reserve a zero-count for invalid CritResIdx. 1303 ExecutedResCounts.resize(1); 1304 assert(!ExecutedResCounts[0] && "nonzero count for bad resource"); 1305 } 1306 1307 /// Pending queues extend the ready queues with the same ID and the 1308 /// PendingFlag set. 1309 SchedBoundary(unsigned ID, const Twine &Name): 1310 DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"), 1311 Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"), 1312 HazardRec(0) { 1313 reset(); 1314 } 1315 1316 ~SchedBoundary() { delete HazardRec; } 1317 1318 void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, 1319 SchedRemainder *rem); 1320 1321 bool isTop() const { 1322 return Available.getID() == ConvergingScheduler::TopQID; 1323 } 1324 1325 #ifndef NDEBUG 1326 const char *getResourceName(unsigned PIdx) { 1327 if (!PIdx) 1328 return "MOps"; 1329 return SchedModel->getProcResource(PIdx)->Name; 1330 } 1331 #endif 1332 1333 /// Get the number of latency cycles "covered" by the scheduled 1334 /// instructions. This is the larger of the critical path within the zone 1335 /// and the number of cycles required to issue the instructions. 1336 unsigned getScheduledLatency() const { 1337 return std::max(ExpectedLatency, CurrCycle); 1338 } 1339 1340 unsigned getUnscheduledLatency(SUnit *SU) const { 1341 return isTop() ? SU->getHeight() : SU->getDepth(); 1342 } 1343 1344 unsigned getResourceCount(unsigned ResIdx) const { 1345 return ExecutedResCounts[ResIdx]; 1346 } 1347 1348 /// Get the scaled count of scheduled micro-ops and resources, including 1349 /// executed resources. 1350 unsigned getCriticalCount() const { 1351 if (!ZoneCritResIdx) 1352 return RetiredMOps * SchedModel->getMicroOpFactor(); 1353 return getResourceCount(ZoneCritResIdx); 1354 } 1355 1356 /// Get a scaled count for the minimum execution time of the scheduled 1357 /// micro-ops that are ready to execute by getExecutedCount. Notice the 1358 /// feedback loop. 
1359 unsigned getExecutedCount() const { 1360 return std::max(CurrCycle * SchedModel->getLatencyFactor(), 1361 MaxExecutedResCount); 1362 } 1363 1364 bool checkHazard(SUnit *SU); 1365 1366 unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs); 1367 1368 unsigned getOtherResourceCount(unsigned &OtherCritIdx); 1369 1370 void setPolicy(CandPolicy &Policy, SchedBoundary &OtherZone); 1371 1372 void releaseNode(SUnit *SU, unsigned ReadyCycle); 1373 1374 void bumpCycle(unsigned NextCycle); 1375 1376 void incExecutedResources(unsigned PIdx, unsigned Count); 1377 1378 unsigned countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle); 1379 1380 void bumpNode(SUnit *SU); 1381 1382 void releasePending(); 1383 1384 void removeReady(SUnit *SU); 1385 1386 SUnit *pickOnlyChoice(); 1387 1388 #ifndef NDEBUG 1389 void dumpScheduledState(); 1390 #endif 1391 }; 1392 1393 private: 1394 ScheduleDAGMI *DAG; 1395 const TargetSchedModel *SchedModel; 1396 const TargetRegisterInfo *TRI; 1397 1398 // State of the top and bottom scheduled instruction boundaries. 1399 SchedRemainder Rem; 1400 SchedBoundary Top; 1401 SchedBoundary Bot; 1402 1403 public: 1404 /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both) 1405 enum { 1406 TopQID = 1, 1407 BotQID = 2, 1408 LogMaxQID = 2 1409 }; 1410 1411 ConvergingScheduler(): 1412 DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {} 1413 1414 virtual void initialize(ScheduleDAGMI *dag); 1415 1416 virtual SUnit *pickNode(bool &IsTopNode); 1417 1418 virtual void schedNode(SUnit *SU, bool IsTopNode); 1419 1420 virtual void releaseTopNode(SUnit *SU); 1421 1422 virtual void releaseBottomNode(SUnit *SU); 1423 1424 virtual void registerRoots(); 1425 1426 protected: 1427 void tryCandidate(SchedCandidate &Cand, 1428 SchedCandidate &TryCand, 1429 SchedBoundary &Zone, 1430 const RegPressureTracker &RPTracker, 1431 RegPressureTracker &TempTracker); 1432 1433 SUnit *pickNodeBidirectional(bool &IsTopNode); 1434 1435 void pickNodeFromQueue(SchedBoundary &Zone, 1436 const RegPressureTracker &RPTracker, 1437 SchedCandidate &Candidate); 1438 1439 void reschedulePhysRegCopies(SUnit *SU, bool isTop); 1440 1441 #ifndef NDEBUG 1442 void traceCandidate(const SchedCandidate &Cand); 1443 #endif 1444 }; 1445 } // namespace 1446 1447 void ConvergingScheduler::SchedRemainder:: 1448 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { 1449 reset(); 1450 if (!SchedModel->hasInstrSchedModel()) 1451 return; 1452 RemainingCounts.resize(SchedModel->getNumProcResourceKinds()); 1453 for (std::vector<SUnit>::iterator 1454 I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) { 1455 const MCSchedClassDesc *SC = DAG->getSchedClass(&*I); 1456 RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC) 1457 * SchedModel->getMicroOpFactor(); 1458 for (TargetSchedModel::ProcResIter 1459 PI = SchedModel->getWriteProcResBegin(SC), 1460 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 1461 unsigned PIdx = PI->ProcResourceIdx; 1462 unsigned Factor = SchedModel->getResourceFactor(PIdx); 1463 RemainingCounts[PIdx] += (Factor * PI->Cycles); 1464 } 1465 } 1466 } 1467 1468 void ConvergingScheduler::SchedBoundary:: 1469 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { 1470 reset(); 1471 DAG = dag; 1472 SchedModel = smodel; 1473 Rem = rem; 1474 if (SchedModel->hasInstrSchedModel()) 1475 ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds()); 1476 } 1477 1478 void ConvergingScheduler::initialize(ScheduleDAGMI *dag) { 1479 DAG = 
dag; 1480 SchedModel = DAG->getSchedModel(); 1481 TRI = DAG->TRI; 1482 1483 Rem.init(DAG, SchedModel); 1484 Top.init(DAG, SchedModel, &Rem); 1485 Bot.init(DAG, SchedModel, &Rem); 1486 1487 // Initialize resource counts. 1488 1489 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or 1490 // are disabled, then these HazardRecs will be disabled. 1491 const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); 1492 const TargetMachine &TM = DAG->MF.getTarget(); 1493 Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); 1494 Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); 1495 1496 assert((!ForceTopDown || !ForceBottomUp) && 1497 "-misched-topdown incompatible with -misched-bottomup"); 1498 } 1499 1500 void ConvergingScheduler::releaseTopNode(SUnit *SU) { 1501 if (SU->isScheduled) 1502 return; 1503 1504 for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); 1505 I != E; ++I) { 1506 if (I->isWeak()) 1507 continue; 1508 unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle; 1509 unsigned Latency = I->getLatency(); 1510 #ifndef NDEBUG 1511 Top.MaxObservedLatency = std::max(Latency, Top.MaxObservedLatency); 1512 #endif 1513 if (SU->TopReadyCycle < PredReadyCycle + Latency) 1514 SU->TopReadyCycle = PredReadyCycle + Latency; 1515 } 1516 Top.releaseNode(SU, SU->TopReadyCycle); 1517 } 1518 1519 void ConvergingScheduler::releaseBottomNode(SUnit *SU) { 1520 if (SU->isScheduled) 1521 return; 1522 1523 assert(SU->getInstr() && "Scheduled SUnit must have instr"); 1524 1525 for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); 1526 I != E; ++I) { 1527 if (I->isWeak()) 1528 continue; 1529 unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; 1530 unsigned Latency = I->getLatency(); 1531 #ifndef NDEBUG 1532 Bot.MaxObservedLatency = std::max(Latency, Bot.MaxObservedLatency); 1533 #endif 1534 if (SU->BotReadyCycle < SuccReadyCycle + Latency) 1535 SU->BotReadyCycle = SuccReadyCycle + Latency; 1536 } 1537 Bot.releaseNode(SU, SU->BotReadyCycle); 1538 } 1539 1540 void ConvergingScheduler::registerRoots() { 1541 Rem.CriticalPath = DAG->ExitSU.getDepth(); 1542 // Some roots may not feed into ExitSU. Check all of them in case. 1543 for (std::vector<SUnit*>::const_iterator 1544 I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) { 1545 if ((*I)->getDepth() > Rem.CriticalPath) 1546 Rem.CriticalPath = (*I)->getDepth(); 1547 } 1548 DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); 1549 } 1550 1551 /// Does this SU have a hazard within the current instruction group. 1552 /// 1553 /// The scheduler supports two modes of hazard recognition. The first is the 1554 /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that 1555 /// supports highly complicated in-order reservation tables 1556 /// (ScoreboardHazardRecognizer) and arbitraty target-specific logic. 1557 /// 1558 /// The second is a streamlined mechanism that checks for hazards based on 1559 /// simple counters that the scheduler itself maintains. It explicitly checks 1560 /// for instruction dispatch limitations, including the number of micro-ops that 1561 /// can dispatch per cycle. 1562 /// 1563 /// TODO: Also check whether the SU must start a new group. 
1564 bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
1565 if (HazardRec->isEnabled())
1566 return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
1567
1568 unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
1569 if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
1570 DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
1571 << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
1572 return true;
1573 }
1574 return false;
1575 }
1576
1577 // Find the unscheduled node in ReadySUs with the highest latency.
1578 unsigned ConvergingScheduler::SchedBoundary::
1579 findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
1580 SUnit *LateSU = 0;
1581 unsigned RemLatency = 0;
1582 for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end();
1583 I != E; ++I) {
1584 unsigned L = getUnscheduledLatency(*I);
1585 if (L > RemLatency) {
1586 RemLatency = L;
1587 LateSU = *I;
1588 }
1589 }
1590 if (LateSU) {
1591 DEBUG(dbgs() << Available.getName() << " RemLatency SU("
1592 << LateSU->NodeNum << ") " << RemLatency << "c\n");
1593 }
1594 return RemLatency;
1595 }
1596
1597 // Count resources in this zone and the remaining unscheduled
1598 // instructions. Return the max count, scaled. Set OtherCritIdx to the critical
1599 // resource index, or zero if the zone is issue limited.
1600 unsigned ConvergingScheduler::SchedBoundary::
1601 getOtherResourceCount(unsigned &OtherCritIdx) {
1602 OtherCritIdx = 0;
1603 if (!SchedModel->hasInstrSchedModel())
1604 return 0;
1605
1606 unsigned OtherCritCount = Rem->RemIssueCount
1607 + (RetiredMOps * SchedModel->getMicroOpFactor());
1608 DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: "
1609 << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
1610 for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
1611 PIdx != PEnd; ++PIdx) {
1612 unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
1613 if (OtherCount > OtherCritCount) {
1614 OtherCritCount = OtherCount;
1615 OtherCritIdx = PIdx;
1616 }
1617 }
1618 if (OtherCritIdx) {
1619 DEBUG(dbgs() << " " << Available.getName() << " + Remain CritRes: "
1620 << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
1621 << " " << getResourceName(OtherCritIdx) << "\n");
1622 }
1623 return OtherCritCount;
1624 }
1625
1626 /// Set the CandPolicy for this zone given the current resources and latencies
1627 /// inside and outside the zone.
1628 void ConvergingScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
1629 SchedBoundary &OtherZone) {
1630 // Now that potential stalls have been considered, apply preemptive heuristics
1631 // based on the total latency and resources inside and outside this
1632 // zone.
1633
1634 // Compute remaining latency. We need this both to determine whether the
1635 // overall schedule has become latency-limited and whether the instructions
1636 // outside this zone are resource or latency limited.
1637 //
1638 // The "dependent" latency is updated incrementally during scheduling as the
1639 // max height/depth of scheduled nodes minus the cycles since it was
1640 // scheduled:
1641 // DLat = max (N.depth - (CurrCycle - N.ReadyCycle)) for N in Zone
1642 //
1643 // The "independent" latency is the max ready queue depth:
1644 // ILat = max N.depth for N in Available|Pending
1645 //
1646 // RemainingLatency is the greater of independent and dependent latency.
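//
// Illustrative example (hypothetical values): with DependentLatency == 6, a
// deepest Available node of unscheduled latency 9, and a deepest Pending node
// of 4, the computation below yields RemainingLatency == max(6, 9, 4) == 9.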
1647 unsigned RemLatency = DependentLatency; 1648 RemLatency = std::max(RemLatency, findMaxLatency(Available.elements())); 1649 RemLatency = std::max(RemLatency, findMaxLatency(Pending.elements())); 1650 1651 // Compute the critical resource outside the zone. 1652 unsigned OtherCritIdx; 1653 unsigned OtherCount = OtherZone.getOtherResourceCount(OtherCritIdx); 1654 1655 bool OtherResLimited = false; 1656 if (SchedModel->hasInstrSchedModel()) { 1657 unsigned LFactor = SchedModel->getLatencyFactor(); 1658 OtherResLimited = (int)(OtherCount - (RemLatency * LFactor)) > (int)LFactor; 1659 } 1660 if (!OtherResLimited && (RemLatency + CurrCycle > Rem->CriticalPath)) { 1661 Policy.ReduceLatency |= true; 1662 DEBUG(dbgs() << " " << Available.getName() << " RemainingLatency " 1663 << RemLatency << " + " << CurrCycle << "c > CritPath " 1664 << Rem->CriticalPath << "\n"); 1665 } 1666 // If the same resource is limiting inside and outside the zone, do nothing. 1667 if (ZoneCritResIdx == OtherCritIdx) 1668 return; 1669 1670 DEBUG( 1671 if (IsResourceLimited) { 1672 dbgs() << " " << Available.getName() << " ResourceLimited: " 1673 << getResourceName(ZoneCritResIdx) << "\n"; 1674 } 1675 if (OtherResLimited) 1676 dbgs() << " RemainingLimit: " << getResourceName(OtherCritIdx) << "\n"; 1677 if (!IsResourceLimited && !OtherResLimited) 1678 dbgs() << " Latency limited both directions.\n"); 1679 1680 if (IsResourceLimited && !Policy.ReduceResIdx) 1681 Policy.ReduceResIdx = ZoneCritResIdx; 1682 1683 if (OtherResLimited) 1684 Policy.DemandResIdx = OtherCritIdx; 1685 } 1686 1687 void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU, 1688 unsigned ReadyCycle) { 1689 if (ReadyCycle < MinReadyCycle) 1690 MinReadyCycle = ReadyCycle; 1691 1692 // Check for interlocks first. For the purpose of other heuristics, an 1693 // instruction that cannot issue appears as if it's not in the ReadyQueue. 1694 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; 1695 if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU)) 1696 Pending.push(SU); 1697 else 1698 Available.push(SU); 1699 1700 // Record this node as an immediate dependent of the scheduled node. 1701 NextSUs.insert(SU); 1702 } 1703 1704 /// Move the boundary of scheduled code by one cycle. 1705 void ConvergingScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) { 1706 if (SchedModel->getMicroOpBufferSize() == 0) { 1707 assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); 1708 if (MinReadyCycle > NextCycle) 1709 NextCycle = MinReadyCycle; 1710 } 1711 // Update the current micro-ops, which will issue in the next cycle. 1712 unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle); 1713 CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps; 1714 1715 // Decrement DependentLatency based on the next cycle. 1716 if ((NextCycle - CurrCycle) > DependentLatency) 1717 DependentLatency = 0; 1718 else 1719 DependentLatency -= (NextCycle - CurrCycle); 1720 1721 if (!HazardRec->isEnabled()) { 1722 // Bypass HazardRec virtual calls. 1723 CurrCycle = NextCycle; 1724 } 1725 else { 1726 // Bypass getHazardType calls in case of long latency. 
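// When a target hazard recognizer is enabled, its internal reservation state
// must stay in sync with CurrCycle, so each skipped cycle is stepped
// individually: AdvanceCycle() for the top-down zone, RecedeCycle() for the
// bottom-up zone.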
1727 for (; CurrCycle != NextCycle; ++CurrCycle) { 1728 if (isTop()) 1729 HazardRec->AdvanceCycle(); 1730 else 1731 HazardRec->RecedeCycle(); 1732 } 1733 } 1734 CheckPending = true; 1735 unsigned LFactor = SchedModel->getLatencyFactor(); 1736 IsResourceLimited = 1737 (int)(getCriticalCount() - (getScheduledLatency() * LFactor)) 1738 > (int)LFactor; 1739 1740 DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n'); 1741 } 1742 1743 void ConvergingScheduler::SchedBoundary::incExecutedResources(unsigned PIdx, 1744 unsigned Count) { 1745 ExecutedResCounts[PIdx] += Count; 1746 if (ExecutedResCounts[PIdx] > MaxExecutedResCount) 1747 MaxExecutedResCount = ExecutedResCounts[PIdx]; 1748 } 1749 1750 /// Add the given processor resource to this scheduled zone. 1751 /// 1752 /// \param Cycles indicates the number of consecutive (non-pipelined) cycles 1753 /// during which this resource is consumed. 1754 /// 1755 /// \return the next cycle at which the instruction may execute without 1756 /// oversubscribing resources. 1757 unsigned ConvergingScheduler::SchedBoundary:: 1758 countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) { 1759 unsigned Factor = SchedModel->getResourceFactor(PIdx); 1760 unsigned Count = Factor * Cycles; 1761 DEBUG(dbgs() << " " << getResourceName(PIdx) 1762 << " +" << Cycles << "x" << Factor << "u\n"); 1763 1764 // Update Executed resources counts. 1765 incExecutedResources(PIdx, Count); 1766 assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted"); 1767 Rem->RemainingCounts[PIdx] -= Count; 1768 1769 // Check if this resource exceeds the current critical resource. If so, it 1770 // becomes the critical resource. 1771 if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) { 1772 ZoneCritResIdx = PIdx; 1773 DEBUG(dbgs() << " *** Critical resource " 1774 << getResourceName(PIdx) << ": " 1775 << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n"); 1776 } 1777 // TODO: We don't yet model reserved resources. It's not hard though. 1778 return CurrCycle; 1779 } 1780 1781 /// Move the boundary of scheduled code by one SUnit. 1782 void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) { 1783 // Update the reservation table. 1784 if (HazardRec->isEnabled()) { 1785 if (!isTop() && SU->isCall) { 1786 // Calls are scheduled with their preceding instructions. For bottom-up 1787 // scheduling, clear the pipeline state before emitting. 1788 HazardRec->Reset(); 1789 } 1790 HazardRec->EmitInstruction(SU); 1791 } 1792 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 1793 unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr()); 1794 CurrMOps += IncMOps; 1795 // checkHazard prevents scheduling multiple instructions per cycle that exceed 1796 // issue width. However, we commonly reach the maximum. In this case 1797 // opportunistically bump the cycle to avoid uselessly checking everything in 1798 // the readyQ. Furthermore, a single instruction may produce more than one 1799 // cycle's worth of micro-ops. 1800 // 1801 // TODO: Also check if this SU must end a dispatch group. 1802 unsigned NextCycle = CurrCycle; 1803 if (CurrMOps >= SchedModel->getIssueWidth()) { 1804 ++NextCycle; 1805 DEBUG(dbgs() << " *** Max MOps " << CurrMOps 1806 << " at cycle " << CurrCycle << '\n'); 1807 } 1808 unsigned ReadyCycle = (isTop() ? 
SU->TopReadyCycle : SU->BotReadyCycle);
1809 DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n");
1810
1811 switch (SchedModel->getMicroOpBufferSize()) {
1812 case 0:
1813 assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
1814 break;
1815 case 1:
1816 if (ReadyCycle > NextCycle) {
1817 NextCycle = ReadyCycle;
1818 DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n");
1819 }
1820 break;
1821 default:
1822 // We don't currently model the OOO reorder buffer, so consider all
1823 // scheduled MOps to be "retired".
1824 break;
1825 }
1826 RetiredMOps += IncMOps;
1827
1828 // Update resource counts and critical resource.
1829 if (SchedModel->hasInstrSchedModel()) {
1830 unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
1831 assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
1832 Rem->RemIssueCount -= DecRemIssue;
1833 if (ZoneCritResIdx) {
1834 // Scale scheduled micro-ops for comparing with the critical resource.
1835 unsigned ScaledMOps =
1836 RetiredMOps * SchedModel->getMicroOpFactor();
1837
1838 // If scaled micro-ops are now more than the previous critical resource by
1839 // a full cycle, then micro-ops issue becomes critical.
1840 if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
1841 >= (int)SchedModel->getLatencyFactor()) {
1842 ZoneCritResIdx = 0;
1843 DEBUG(dbgs() << " *** Critical resource NumMicroOps: "
1844 << ScaledMOps / SchedModel->getLatencyFactor() << "c\n");
1845 }
1846 }
1847 for (TargetSchedModel::ProcResIter
1848 PI = SchedModel->getWriteProcResBegin(SC),
1849 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
1850 unsigned RCycle =
1851 countResource(PI->ProcResourceIdx, PI->Cycles, ReadyCycle);
1852 if (RCycle > NextCycle)
1853 NextCycle = RCycle;
1854 }
1855 }
1856 // Update ExpectedLatency and DependentLatency.
1857 unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
1858 unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
1859 if (SU->getDepth() > TopLatency) {
1860 TopLatency = SU->getDepth();
1861 DEBUG(dbgs() << " " << Available.getName()
1862 << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n");
1863 }
1864 if (SU->getHeight() > BotLatency) {
1865 BotLatency = SU->getHeight();
1866 DEBUG(dbgs() << " " << Available.getName()
1867 << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n");
1868 }
1869 // If we stall for any reason, bump the cycle.
1870 if (NextCycle > CurrCycle) {
1871 bumpCycle(NextCycle);
1872 }
1873 else {
1874 // After updating ZoneCritResIdx and ExpectedLatency, check if we're
1875 // resource limited. If a stall occurred, bumpCycle does this.
1876 unsigned LFactor = SchedModel->getLatencyFactor();
1877 IsResourceLimited =
1878 (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
1879 > (int)LFactor;
1880 }
1881 DEBUG(dumpScheduledState());
1882 }
1883
1884 /// Release pending ready nodes into the available queue. This makes them
1885 /// visible to heuristics.
1886 void ConvergingScheduler::SchedBoundary::releasePending() {
1887 // If the available queue is empty, it is safe to reset MinReadyCycle.
1888 if (Available.empty())
1889 MinReadyCycle = UINT_MAX;
1890
1891 // Check to see if any of the pending instructions are ready to issue. If
1892 // so, add them to the available queue.
1893 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
1894 for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
1895 SUnit *SU = *(Pending.begin()+i);
1896 unsigned ReadyCycle = isTop() ?
SU->TopReadyCycle : SU->BotReadyCycle; 1897 1898 if (ReadyCycle < MinReadyCycle) 1899 MinReadyCycle = ReadyCycle; 1900 1901 if (!IsBuffered && ReadyCycle > CurrCycle) 1902 continue; 1903 1904 if (checkHazard(SU)) 1905 continue; 1906 1907 Available.push(SU); 1908 Pending.remove(Pending.begin()+i); 1909 --i; --e; 1910 } 1911 DEBUG(if (!Pending.empty()) Pending.dump()); 1912 CheckPending = false; 1913 } 1914 1915 /// Remove SU from the ready set for this boundary. 1916 void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) { 1917 if (Available.isInQueue(SU)) 1918 Available.remove(Available.find(SU)); 1919 else { 1920 assert(Pending.isInQueue(SU) && "bad ready count"); 1921 Pending.remove(Pending.find(SU)); 1922 } 1923 } 1924 1925 /// If this queue only has one ready candidate, return it. As a side effect, 1926 /// defer any nodes that now hit a hazard, and advance the cycle until at least 1927 /// one node is ready. If multiple instructions are ready, return NULL. 1928 SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() { 1929 if (CheckPending) 1930 releasePending(); 1931 1932 if (CurrMOps > 0) { 1933 // Defer any ready instrs that now have a hazard. 1934 for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) { 1935 if (checkHazard(*I)) { 1936 Pending.push(*I); 1937 I = Available.remove(I); 1938 continue; 1939 } 1940 ++I; 1941 } 1942 } 1943 for (unsigned i = 0; Available.empty(); ++i) { 1944 assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) && 1945 "permanent hazard"); (void)i; 1946 bumpCycle(CurrCycle + 1); 1947 releasePending(); 1948 } 1949 if (Available.size() == 1) 1950 return *Available.begin(); 1951 return NULL; 1952 } 1953 1954 #ifndef NDEBUG 1955 // This is useful information to dump after bumpNode. 1956 // Note that the Queue contents are more useful before pickNodeFromQueue. 1957 void ConvergingScheduler::SchedBoundary::dumpScheduledState() { 1958 unsigned ResFactor; 1959 unsigned ResCount; 1960 if (ZoneCritResIdx) { 1961 ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx); 1962 ResCount = getResourceCount(ZoneCritResIdx); 1963 } 1964 else { 1965 ResFactor = SchedModel->getMicroOpFactor(); 1966 ResCount = RetiredMOps * SchedModel->getMicroOpFactor(); 1967 } 1968 unsigned LFactor = SchedModel->getLatencyFactor(); 1969 dbgs() << Available.getName() << " @" << CurrCycle << "c\n" 1970 << " Retired: " << RetiredMOps; 1971 dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c"; 1972 dbgs() << "\n Critical: " << ResCount / LFactor << "c, " 1973 << ResCount / ResFactor << " " << getResourceName(ZoneCritResIdx) 1974 << "\n ExpectedLatency: " << ExpectedLatency << "c\n" 1975 << (IsResourceLimited ? " - Resource" : " - Latency") 1976 << " limited.\n"; 1977 } 1978 #endif 1979 1980 void ConvergingScheduler::SchedCandidate:: 1981 initResourceDelta(const ScheduleDAGMI *DAG, 1982 const TargetSchedModel *SchedModel) { 1983 if (!Policy.ReduceResIdx && !Policy.DemandResIdx) 1984 return; 1985 1986 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 1987 for (TargetSchedModel::ProcResIter 1988 PI = SchedModel->getWriteProcResBegin(SC), 1989 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 1990 if (PI->ProcResourceIdx == Policy.ReduceResIdx) 1991 ResDelta.CritResources += PI->Cycles; 1992 if (PI->ProcResourceIdx == Policy.DemandResIdx) 1993 ResDelta.DemandedResources += PI->Cycles; 1994 } 1995 } 1996 1997 1998 /// Return true if this heuristic determines order. 
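///
/// tryLess/tryGreater follow a common convention: they return true when the
/// comparison is decisive in either direction (recording the winning Reason on
/// the appropriate candidate), and false on a tie so the caller falls through
/// to the next heuristic. For example, tryLess(2, 5, TryCand, Cand, RegExcess)
/// sets TryCand.Reason to RegExcess and returns true, which stops further
/// evaluation for this pair in tryCandidate.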
1999 static bool tryLess(int TryVal, int CandVal,
2000 ConvergingScheduler::SchedCandidate &TryCand,
2001 ConvergingScheduler::SchedCandidate &Cand,
2002 ConvergingScheduler::CandReason Reason) {
2003 if (TryVal < CandVal) {
2004 TryCand.Reason = Reason;
2005 return true;
2006 }
2007 if (TryVal > CandVal) {
2008 if (Cand.Reason > Reason)
2009 Cand.Reason = Reason;
2010 return true;
2011 }
2012 Cand.setRepeat(Reason);
2013 return false;
2014 }
2015
2016 static bool tryGreater(int TryVal, int CandVal,
2017 ConvergingScheduler::SchedCandidate &TryCand,
2018 ConvergingScheduler::SchedCandidate &Cand,
2019 ConvergingScheduler::CandReason Reason) {
2020 if (TryVal > CandVal) {
2021 TryCand.Reason = Reason;
2022 return true;
2023 }
2024 if (TryVal < CandVal) {
2025 if (Cand.Reason > Reason)
2026 Cand.Reason = Reason;
2027 return true;
2028 }
2029 Cand.setRepeat(Reason);
2030 return false;
2031 }
2032
2033 static bool tryPressure(const PressureElement &TryP,
2034 const PressureElement &CandP,
2035 ConvergingScheduler::SchedCandidate &TryCand,
2036 ConvergingScheduler::SchedCandidate &Cand,
2037 ConvergingScheduler::CandReason Reason) {
2038 // If both candidates affect the same set, go with the smallest increase.
2039 if (TryP.PSetID == CandP.PSetID) {
2040 return tryLess(TryP.UnitIncrease, CandP.UnitIncrease, TryCand, Cand,
2041 Reason);
2042 }
2043 // If one candidate decreases and the other increases, go with it.
2044 if (tryLess(TryP.UnitIncrease < 0, CandP.UnitIncrease < 0, TryCand, Cand,
2045 Reason)) {
2046 return true;
2047 }
2048 // If TryP has lower Rank, it has a higher priority.
2049 int TryRank = TryP.PSetRank();
2050 int CandRank = CandP.PSetRank();
2051 // If the candidates are decreasing pressure, reverse priority.
2052 if (TryP.UnitIncrease < 0)
2053 std::swap(TryRank, CandRank);
2054 return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
2055 }
2056
2057 static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
2058 return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
2059 }
2060
2061 /// Minimize physical register live ranges. Regalloc wants them adjacent to
2062 /// their physreg def/use.
2063 ///
2064 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
2065 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
2066 /// with the operation that produces or consumes the physreg. We'll do this when
2067 /// regalloc has support for parallel copies.
2068 static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
2069 const MachineInstr *MI = SU->getInstr();
2070 if (!MI->isCopy())
2071 return 0;
2072
2073 unsigned ScheduledOper = isTop ? 1 : 0;
2074 unsigned UnscheduledOper = isTop ? 0 : 1;
2075 // If we have already scheduled the physreg producer/consumer, immediately
2076 // schedule the copy.
2077 if (TargetRegisterInfo::isPhysicalRegister(
2078 MI->getOperand(ScheduledOper).getReg()))
2079 return 1;
2080 // If the physreg is at the boundary, defer it. Otherwise schedule it
2081 // immediately to free the dependent. We can hoist the copy later.
2082 bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
2083 if (TargetRegisterInfo::isPhysicalRegister(
2084 MI->getOperand(UnscheduledOper).getReg()))
2085 return AtBoundary ? -1 : 1;
2086 return 0;
2087 }
2088
2089 /// Apply a set of heuristics to a new candidate. Heuristics are currently
2090 /// hierarchical.
This may be more efficient than a graduated cost model because 2091 /// we don't need to evaluate all aspects of the model for each node in the 2092 /// queue. But it's really done to make the heuristics easier to debug and 2093 /// statistically analyze. 2094 /// 2095 /// \param Cand provides the policy and current best candidate. 2096 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. 2097 /// \param Zone describes the scheduled zone that we are extending. 2098 /// \param RPTracker describes reg pressure within the scheduled zone. 2099 /// \param TempTracker is a scratch pressure tracker to reuse in queries. 2100 void ConvergingScheduler::tryCandidate(SchedCandidate &Cand, 2101 SchedCandidate &TryCand, 2102 SchedBoundary &Zone, 2103 const RegPressureTracker &RPTracker, 2104 RegPressureTracker &TempTracker) { 2105 2106 // Always initialize TryCand's RPDelta. 2107 TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta, 2108 DAG->getRegionCriticalPSets(), 2109 DAG->getRegPressure().MaxSetPressure); 2110 2111 // Initialize the candidate if needed. 2112 if (!Cand.isValid()) { 2113 TryCand.Reason = NodeOrder; 2114 return; 2115 } 2116 2117 if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()), 2118 biasPhysRegCopy(Cand.SU, Zone.isTop()), 2119 TryCand, Cand, PhysRegCopy)) 2120 return; 2121 2122 // Avoid exceeding the target's limit. If signed PSetID is negative, it is 2123 // invalid; convert it to INT_MAX to give it lowest priority. 2124 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, 2125 RegExcess)) 2126 return; 2127 2128 // Avoid increasing the max critical pressure in the scheduled region. 2129 if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, 2130 TryCand, Cand, RegCritical)) 2131 return; 2132 2133 // Keep clustered nodes together to encourage downstream peephole 2134 // optimizations which may reduce resource requirements. 2135 // 2136 // This is a best effort to set things up for a post-RA pass. Optimizations 2137 // like generating loads of multiple registers should ideally be done within 2138 // the scheduler pass by combining the loads during DAG postprocessing. 2139 const SUnit *NextClusterSU = 2140 Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); 2141 if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU, 2142 TryCand, Cand, Cluster)) 2143 return; 2144 2145 // Weak edges are for clustering and other constraints. 2146 if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()), 2147 getWeakLeft(Cand.SU, Zone.isTop()), 2148 TryCand, Cand, Weak)) { 2149 return; 2150 } 2151 // Avoid increasing the max pressure of the entire region. 2152 if (tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, 2153 TryCand, Cand, RegMax)) 2154 return; 2155 2156 // Avoid critical resource consumption and balance the schedule. 2157 TryCand.initResourceDelta(DAG, SchedModel); 2158 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, 2159 TryCand, Cand, ResourceReduce)) 2160 return; 2161 if (tryGreater(TryCand.ResDelta.DemandedResources, 2162 Cand.ResDelta.DemandedResources, 2163 TryCand, Cand, ResourceDemand)) 2164 return; 2165 2166 // Avoid serializing long latency dependence chains. 
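// These depth/height comparisons only apply when setPolicy marked this zone as
// latency limited (Policy.ReduceLatency). Top-down, prefer the candidate whose
// depth does not extend the zone's scheduled latency and whose remaining
// height is largest; the bottom-up checks below mirror this with height and
// depth swapped.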
2167 if (Cand.Policy.ReduceLatency) { 2168 if (Zone.isTop()) { 2169 if (Cand.SU->getDepth() > Zone.getScheduledLatency()) { 2170 if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), 2171 TryCand, Cand, TopDepthReduce)) 2172 return; 2173 } 2174 if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(), 2175 TryCand, Cand, TopPathReduce)) 2176 return; 2177 } 2178 else { 2179 if (Cand.SU->getHeight() > Zone.getScheduledLatency()) { 2180 if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), 2181 TryCand, Cand, BotHeightReduce)) 2182 return; 2183 } 2184 if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(), 2185 TryCand, Cand, BotPathReduce)) 2186 return; 2187 } 2188 } 2189 2190 // Prefer immediate defs/users of the last scheduled instruction. This is a 2191 // local pressure avoidance strategy that also makes the machine code 2192 // readable. 2193 if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU), 2194 TryCand, Cand, NextDefUse)) 2195 return; 2196 2197 // Fall through to original instruction order. 2198 if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) 2199 || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { 2200 TryCand.Reason = NodeOrder; 2201 } 2202 } 2203 2204 #ifndef NDEBUG 2205 const char *ConvergingScheduler::getReasonStr( 2206 ConvergingScheduler::CandReason Reason) { 2207 switch (Reason) { 2208 case NoCand: return "NOCAND "; 2209 case PhysRegCopy: return "PREG-COPY"; 2210 case RegExcess: return "REG-EXCESS"; 2211 case RegCritical: return "REG-CRIT "; 2212 case Cluster: return "CLUSTER "; 2213 case Weak: return "WEAK "; 2214 case RegMax: return "REG-MAX "; 2215 case ResourceReduce: return "RES-REDUCE"; 2216 case ResourceDemand: return "RES-DEMAND"; 2217 case TopDepthReduce: return "TOP-DEPTH "; 2218 case TopPathReduce: return "TOP-PATH "; 2219 case BotHeightReduce:return "BOT-HEIGHT"; 2220 case BotPathReduce: return "BOT-PATH "; 2221 case NextDefUse: return "DEF-USE "; 2222 case NodeOrder: return "ORDER "; 2223 }; 2224 llvm_unreachable("Unknown reason!"); 2225 } 2226 2227 void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) { 2228 PressureElement P; 2229 unsigned ResIdx = 0; 2230 unsigned Latency = 0; 2231 switch (Cand.Reason) { 2232 default: 2233 break; 2234 case RegExcess: 2235 P = Cand.RPDelta.Excess; 2236 break; 2237 case RegCritical: 2238 P = Cand.RPDelta.CriticalMax; 2239 break; 2240 case RegMax: 2241 P = Cand.RPDelta.CurrentMax; 2242 break; 2243 case ResourceReduce: 2244 ResIdx = Cand.Policy.ReduceResIdx; 2245 break; 2246 case ResourceDemand: 2247 ResIdx = Cand.Policy.DemandResIdx; 2248 break; 2249 case TopDepthReduce: 2250 Latency = Cand.SU->getDepth(); 2251 break; 2252 case TopPathReduce: 2253 Latency = Cand.SU->getHeight(); 2254 break; 2255 case BotHeightReduce: 2256 Latency = Cand.SU->getHeight(); 2257 break; 2258 case BotPathReduce: 2259 Latency = Cand.SU->getDepth(); 2260 break; 2261 } 2262 dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); 2263 if (P.isValid()) 2264 dbgs() << " " << TRI->getRegPressureSetName(P.PSetID) 2265 << ":" << P.UnitIncrease << " "; 2266 else 2267 dbgs() << " "; 2268 if (ResIdx) 2269 dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " "; 2270 else 2271 dbgs() << " "; 2272 if (Latency) 2273 dbgs() << " " << Latency << " cycles "; 2274 else 2275 dbgs() << " "; 2276 dbgs() << '\n'; 2277 } 2278 #endif 2279 2280 /// Pick the best candidate from the top queue. 
2281 /// 2282 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during 2283 /// DAG building. To adjust for the current scheduling location we need to 2284 /// maintain the number of vreg uses remaining to be top-scheduled. 2285 void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone, 2286 const RegPressureTracker &RPTracker, 2287 SchedCandidate &Cand) { 2288 ReadyQueue &Q = Zone.Available; 2289 2290 DEBUG(Q.dump()); 2291 2292 // getMaxPressureDelta temporarily modifies the tracker. 2293 RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); 2294 2295 for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { 2296 2297 SchedCandidate TryCand(Cand.Policy); 2298 TryCand.SU = *I; 2299 tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker); 2300 if (TryCand.Reason != NoCand) { 2301 // Initialize resource delta if needed in case future heuristics query it. 2302 if (TryCand.ResDelta == SchedResourceDelta()) 2303 TryCand.initResourceDelta(DAG, SchedModel); 2304 Cand.setBest(TryCand); 2305 DEBUG(traceCandidate(Cand)); 2306 } 2307 } 2308 } 2309 2310 static void tracePick(const ConvergingScheduler::SchedCandidate &Cand, 2311 bool IsTop) { 2312 DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") 2313 << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n'); 2314 } 2315 2316 /// Pick the best candidate node from either the top or bottom queue. 2317 SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) { 2318 // Schedule as far as possible in the direction of no choice. This is most 2319 // efficient, but also provides the best heuristics for CriticalPSets. 2320 if (SUnit *SU = Bot.pickOnlyChoice()) { 2321 IsTopNode = false; 2322 DEBUG(dbgs() << "Pick Bot NOCAND\n"); 2323 return SU; 2324 } 2325 if (SUnit *SU = Top.pickOnlyChoice()) { 2326 IsTopNode = true; 2327 DEBUG(dbgs() << "Pick Top NOCAND\n"); 2328 return SU; 2329 } 2330 CandPolicy NoPolicy; 2331 SchedCandidate BotCand(NoPolicy); 2332 SchedCandidate TopCand(NoPolicy); 2333 Bot.setPolicy(BotCand.Policy, Top); 2334 Top.setPolicy(TopCand.Policy, Bot); 2335 2336 // Prefer bottom scheduling when heuristics are silent. 2337 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); 2338 assert(BotCand.Reason != NoCand && "failed to find the first candidate"); 2339 2340 // If either Q has a single candidate that provides the least increase in 2341 // Excess pressure, we can immediately schedule from that Q. 2342 // 2343 // RegionCriticalPSets summarizes the pressure within the scheduled region and 2344 // affects picking from either Q. If scheduling in one direction must 2345 // increase pressure for one of the excess PSets, then schedule in that 2346 // direction first to provide more freedom in the other direction. 2347 if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess)) 2348 || (BotCand.Reason == RegCritical 2349 && !BotCand.isRepeat(RegCritical))) 2350 { 2351 IsTopNode = false; 2352 tracePick(BotCand, IsTopNode); 2353 return BotCand.SU; 2354 } 2355 // Check if the top Q has a better candidate. 2356 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); 2357 assert(TopCand.Reason != NoCand && "failed to find the first candidate"); 2358 2359 // Choose the queue with the most important (lowest enum) reason. 2360 if (TopCand.Reason < BotCand.Reason) { 2361 IsTopNode = true; 2362 tracePick(TopCand, IsTopNode); 2363 return TopCand.SU; 2364 } 2365 // Otherwise prefer the bottom candidate, in node order if all else failed. 
2366 IsTopNode = false;
2367 tracePick(BotCand, IsTopNode);
2368 return BotCand.SU;
2369 }
2370
2371 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
2372 SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
2373 if (DAG->top() == DAG->bottom()) {
2374 assert(Top.Available.empty() && Top.Pending.empty() &&
2375 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
2376 return NULL;
2377 }
2378 SUnit *SU;
2379 do {
2380 if (ForceTopDown) {
2381 SU = Top.pickOnlyChoice();
2382 if (!SU) {
2383 CandPolicy NoPolicy;
2384 SchedCandidate TopCand(NoPolicy);
2385 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
2386 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
2387 SU = TopCand.SU;
2388 }
2389 IsTopNode = true;
2390 }
2391 else if (ForceBottomUp) {
2392 SU = Bot.pickOnlyChoice();
2393 if (!SU) {
2394 CandPolicy NoPolicy;
2395 SchedCandidate BotCand(NoPolicy);
2396 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
2397 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
2398 SU = BotCand.SU;
2399 }
2400 IsTopNode = false;
2401 }
2402 else {
2403 SU = pickNodeBidirectional(IsTopNode);
2404 }
2405 } while (SU->isScheduled);
2406
2407 if (SU->isTopReady())
2408 Top.removeReady(SU);
2409 if (SU->isBottomReady())
2410 Bot.removeReady(SU);
2411
2412 DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
2413 return SU;
2414 }
2415
2416 void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
2417
2418 MachineBasicBlock::iterator InsertPos = SU->getInstr();
2419 if (!isTop)
2420 ++InsertPos;
2421 SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs;
2422
2423 // Find already scheduled copies with a single physreg dependence and move
2424 // them just above the scheduled instruction.
2425 for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end();
2426 I != E; ++I) {
2427 if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg()))
2428 continue;
2429 SUnit *DepSU = I->getSUnit();
2430 if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
2431 continue;
2432 MachineInstr *Copy = DepSU->getInstr();
2433 if (!Copy->isCopy())
2434 continue;
2435 DEBUG(dbgs() << " Rescheduling physreg copy ";
2436 I->getSUnit()->dump(DAG));
2437 DAG->moveInstruction(Copy, InsertPos);
2438 }
2439 }
2440
2441 /// Update the scheduler's state after scheduling a node. This is the same node
2442 /// that was just returned by pickNode(). However, ScheduleDAGMI needs to update
2443 /// its state based on the current cycle before MachineSchedStrategy does.
2444 ///
2445 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
2446 /// them here. See comments in biasPhysRegCopy.
2447 void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
2448 if (IsTopNode) {
2449 SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.CurrCycle);
2450 Top.bumpNode(SU);
2451 if (SU->hasPhysRegUses)
2452 reschedulePhysRegCopies(SU, true);
2453 }
2454 else {
2455 SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.CurrCycle);
2456 Bot.bumpNode(SU);
2457 if (SU->hasPhysRegDefs)
2458 reschedulePhysRegCopies(SU, false);
2459 }
2460 }
2461
2462 /// Create the standard converging machine scheduler. This will be used as the
2463 /// default scheduler if the target does not set a default.
2464 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
2465 assert((!ForceTopDown || !ForceBottomUp) &&
2466 "-misched-topdown incompatible with -misched-bottomup");
2467 ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
2468 // Register DAG post-processors.
2469 //
2470 // FIXME: extend the mutation API to allow earlier mutations to instantiate
2471 // data and pass it to later mutations. Have a single mutation that gathers
2472 // the interesting nodes in one pass.
2473 DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI));
2474 if (EnableLoadCluster)
2475 DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
2476 if (EnableMacroFusion)
2477 DAG->addMutation(new MacroFusion(DAG->TII));
2478 return DAG;
2479 }
2480 static MachineSchedRegistry
2481 ConvergingSchedRegistry("converge", "Standard converging scheduler.",
2482 createConvergingSched);
2483
2484 //===----------------------------------------------------------------------===//
2485 // ILP Scheduler. Currently for experimental analysis of heuristics.
2486 //===----------------------------------------------------------------------===//
2487
2488 namespace {
2489 /// \brief Order nodes by the ILP metric.
2490 struct ILPOrder {
2491 const SchedDFSResult *DFSResult;
2492 const BitVector *ScheduledTrees;
2493 bool MaximizeILP;
2494
2495 ILPOrder(bool MaxILP): DFSResult(0), ScheduledTrees(0), MaximizeILP(MaxILP) {}
2496
2497 /// \brief Apply a less-than relation on node priority.
2498 ///
2499 /// (Return true if A comes after B in the Q.)
2500 bool operator()(const SUnit *A, const SUnit *B) const {
2501 unsigned SchedTreeA = DFSResult->getSubtreeID(A);
2502 unsigned SchedTreeB = DFSResult->getSubtreeID(B);
2503 if (SchedTreeA != SchedTreeB) {
2504 // Unscheduled trees have lower priority.
2505 if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
2506 return ScheduledTrees->test(SchedTreeB);
2507
2508 // Trees with shallower connections have lower priority.
2509 if (DFSResult->getSubtreeLevel(SchedTreeA)
2510 != DFSResult->getSubtreeLevel(SchedTreeB)) {
2511 return DFSResult->getSubtreeLevel(SchedTreeA)
2512 < DFSResult->getSubtreeLevel(SchedTreeB);
2513 }
2514 }
2515 if (MaximizeILP)
2516 return DFSResult->getILP(A) < DFSResult->getILP(B);
2517 else
2518 return DFSResult->getILP(A) > DFSResult->getILP(B);
2519 }
2520 };
2521
2522 /// \brief Schedule based on the ILP metric.
2523 class ILPScheduler : public MachineSchedStrategy {
2524 /// In case all subtrees are eventually connected to a common root through
2525 /// data dependence (e.g. reduction), place an upper limit on their size.
2526 ///
2527 /// FIXME: A subtree limit is generally good, but in the situation commented
2528 /// above, where multiple similar subtrees feed a common root, we should
2529 /// only split at a point where the resulting subtrees will be balanced.
2530 /// (a motivating test case must be found).
2531 static const unsigned SubtreeLimit = 16;
2532
2533 ScheduleDAGMI *DAG;
2534 ILPOrder Cmp;
2535
2536 std::vector<SUnit*> ReadyQ;
2537 public:
2538 ILPScheduler(bool MaximizeILP): DAG(0), Cmp(MaximizeILP) {}
2539
2540 virtual void initialize(ScheduleDAGMI *dag) {
2541 DAG = dag;
2542 DAG->computeDFSResult();
2543 Cmp.DFSResult = DAG->getDFSResult();
2544 Cmp.ScheduledTrees = &DAG->getScheduledTrees();
2545 ReadyQ.clear();
2546 }
2547
2548 virtual void registerRoots() {
2549 // Restore the heap in ReadyQ with the updated DFS results.
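// ILPOrder compares nodes using SchedDFSResult data (subtree IDs, levels, and
// ILP values), so the queue is re-heapified here in case those priorities
// changed after the roots were first pushed.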
2550 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2551 } 2552 2553 /// Implement MachineSchedStrategy interface. 2554 /// ----------------------------------------- 2555 2556 /// Callback to select the highest priority node from the ready Q. 2557 virtual SUnit *pickNode(bool &IsTopNode) { 2558 if (ReadyQ.empty()) return NULL; 2559 std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2560 SUnit *SU = ReadyQ.back(); 2561 ReadyQ.pop_back(); 2562 IsTopNode = false; 2563 DEBUG(dbgs() << "Pick node " << "SU(" << SU->NodeNum << ") " 2564 << " ILP: " << DAG->getDFSResult()->getILP(SU) 2565 << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @" 2566 << DAG->getDFSResult()->getSubtreeLevel( 2567 DAG->getDFSResult()->getSubtreeID(SU)) << '\n' 2568 << "Scheduling " << *SU->getInstr()); 2569 return SU; 2570 } 2571 2572 /// \brief Scheduler callback to notify that a new subtree is scheduled. 2573 virtual void scheduleTree(unsigned SubtreeID) { 2574 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2575 } 2576 2577 /// Callback after a node is scheduled. Mark a newly scheduled tree, notify 2578 /// DFSResults, and resort the priority Q. 2579 virtual void schedNode(SUnit *SU, bool IsTopNode) { 2580 assert(!IsTopNode && "SchedDFSResult needs bottom-up"); 2581 } 2582 2583 virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ } 2584 2585 virtual void releaseBottomNode(SUnit *SU) { 2586 ReadyQ.push_back(SU); 2587 std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2588 } 2589 }; 2590 } // namespace 2591 2592 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { 2593 return new ScheduleDAGMI(C, new ILPScheduler(true)); 2594 } 2595 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { 2596 return new ScheduleDAGMI(C, new ILPScheduler(false)); 2597 } 2598 static MachineSchedRegistry ILPMaxRegistry( 2599 "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler); 2600 static MachineSchedRegistry ILPMinRegistry( 2601 "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler); 2602 2603 //===----------------------------------------------------------------------===// 2604 // Machine Instruction Shuffler for Correctness Testing 2605 //===----------------------------------------------------------------------===// 2606 2607 #ifndef NDEBUG 2608 namespace { 2609 /// Apply a less-than relation on the node order, which corresponds to the 2610 /// instruction order prior to scheduling. IsReverse implements greater-than. 2611 template<bool IsReverse> 2612 struct SUnitOrder { 2613 bool operator()(SUnit *A, SUnit *B) const { 2614 if (IsReverse) 2615 return A->NodeNum > B->NodeNum; 2616 else 2617 return A->NodeNum < B->NodeNum; 2618 } 2619 }; 2620 2621 /// Reorder instructions as much as possible. 2622 class InstructionShuffler : public MachineSchedStrategy { 2623 bool IsAlternating; 2624 bool IsTopDown; 2625 2626 // Using a less-than relation (SUnitOrder<false>) for the TopQ priority 2627 // gives nodes with a higher number higher priority causing the latest 2628 // instructions to be scheduled first. 2629 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> > 2630 TopQ; 2631 // When scheduling bottom-up, use greater-than as the queue priority. 
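// With SUnitOrder<true>, top() yields the SUnit with the smallest NodeNum, so
// bottom-up picks start from the originally earliest instructions, which
// maximizes reordering.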
2632 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> > 2633 BottomQ; 2634 public: 2635 InstructionShuffler(bool alternate, bool topdown) 2636 : IsAlternating(alternate), IsTopDown(topdown) {} 2637 2638 virtual void initialize(ScheduleDAGMI *) { 2639 TopQ.clear(); 2640 BottomQ.clear(); 2641 } 2642 2643 /// Implement MachineSchedStrategy interface. 2644 /// ----------------------------------------- 2645 2646 virtual SUnit *pickNode(bool &IsTopNode) { 2647 SUnit *SU; 2648 if (IsTopDown) { 2649 do { 2650 if (TopQ.empty()) return NULL; 2651 SU = TopQ.top(); 2652 TopQ.pop(); 2653 } while (SU->isScheduled); 2654 IsTopNode = true; 2655 } 2656 else { 2657 do { 2658 if (BottomQ.empty()) return NULL; 2659 SU = BottomQ.top(); 2660 BottomQ.pop(); 2661 } while (SU->isScheduled); 2662 IsTopNode = false; 2663 } 2664 if (IsAlternating) 2665 IsTopDown = !IsTopDown; 2666 return SU; 2667 } 2668 2669 virtual void schedNode(SUnit *SU, bool IsTopNode) {} 2670 2671 virtual void releaseTopNode(SUnit *SU) { 2672 TopQ.push(SU); 2673 } 2674 virtual void releaseBottomNode(SUnit *SU) { 2675 BottomQ.push(SU); 2676 } 2677 }; 2678 } // namespace 2679 2680 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { 2681 bool Alternate = !ForceTopDown && !ForceBottomUp; 2682 bool TopDown = !ForceBottomUp; 2683 assert((TopDown || !ForceTopDown) && 2684 "-misched-topdown incompatible with -misched-bottomup"); 2685 return new ScheduleDAGMI(C, new InstructionShuffler(Alternate, TopDown)); 2686 } 2687 static MachineSchedRegistry ShufflerRegistry( 2688 "shuffle", "Shuffle machine instructions alternating directions", 2689 createInstructionShuffler); 2690 #endif // !NDEBUG 2691 2692 //===----------------------------------------------------------------------===// 2693 // GraphWriter support for ScheduleDAGMI. 2694 //===----------------------------------------------------------------------===// 2695 2696 #ifndef NDEBUG 2697 namespace llvm { 2698 2699 template<> struct GraphTraits< 2700 ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {}; 2701 2702 template<> 2703 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { 2704 2705 DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} 2706 2707 static std::string getGraphName(const ScheduleDAG *G) { 2708 return G->MF.getName(); 2709 } 2710 2711 static bool renderGraphFromBottomUp() { 2712 return true; 2713 } 2714 2715 static bool isNodeHidden(const SUnit *Node) { 2716 return (Node->NumPreds > 10 || Node->NumSuccs > 10); 2717 } 2718 2719 static bool hasNodeAddressLabel(const SUnit *Node, 2720 const ScheduleDAG *Graph) { 2721 return false; 2722 } 2723 2724 /// If you want to override the dot attributes printed for a particular 2725 /// edge, override this method. 
2726 static std::string getEdgeAttributes(const SUnit *Node, 2727 SUnitIterator EI, 2728 const ScheduleDAG *Graph) { 2729 if (EI.isArtificialDep()) 2730 return "color=cyan,style=dashed"; 2731 if (EI.isCtrlDep()) 2732 return "color=blue,style=dashed"; 2733 return ""; 2734 } 2735 2736 static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) { 2737 std::string Str; 2738 raw_string_ostream SS(Str); 2739 SS << "SU(" << SU->NodeNum << ')'; 2740 return SS.str(); 2741 } 2742 static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) { 2743 return G->getGraphNodeLabel(SU); 2744 } 2745 2746 static std::string getNodeAttributes(const SUnit *N, 2747 const ScheduleDAG *Graph) { 2748 std::string Str("shape=Mrecord"); 2749 const SchedDFSResult *DFS = 2750 static_cast<const ScheduleDAGMI*>(Graph)->getDFSResult(); 2751 if (DFS) { 2752 Str += ",style=filled,fillcolor=\"#"; 2753 Str += DOT::getColorString(DFS->getSubtreeID(N)); 2754 Str += '"'; 2755 } 2756 return Str; 2757 } 2758 }; 2759 } // namespace llvm 2760 #endif // NDEBUG 2761 2762 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG 2763 /// rendered using 'dot'. 2764 /// 2765 void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) { 2766 #ifndef NDEBUG 2767 ViewGraph(this, Name, false, Title); 2768 #else 2769 errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on " 2770 << "systems with Graphviz or gv!\n"; 2771 #endif // NDEBUG 2772 } 2773 2774 /// Out-of-line implementation with no arguments is handy for gdb. 2775 void ScheduleDAGMI::viewGraph() { 2776 viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName()); 2777 } 2778