//===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// MachineScheduler schedules machine instructions after phi elimination. It
// preserves LiveIntervals so it can be invoked before register allocation.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "misched"

#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include <queue>

using namespace llvm;

namespace llvm {
cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
                           cl::desc("Force top-down list scheduling"));
cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
                            cl::desc("Force bottom-up list scheduling"));
}

#ifndef NDEBUG
static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
  cl::desc("Pop up a window to show MISched dags after they are processed"));

static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
  cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
#else
static bool ViewMISchedDAGs = false;
#endif // NDEBUG

static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
  cl::desc("Enable cyclic critical path analysis."), cl::init(false));

static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
  cl::desc("Enable load clustering."), cl::init(true));

// Experimental heuristics
static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
  cl::desc("Enable scheduling for macro fusion."), cl::init(true));

static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
  cl::desc("Verify machine instrs before and after machine scheduling"));

// DAG subtrees must have at least this many nodes.
static const unsigned MinSubtreeSize = 8;

//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//

MachineSchedContext::MachineSchedContext():
    MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) {
  RegClassInfo = new RegisterClassInfo();
}

MachineSchedContext::~MachineSchedContext() {
  delete RegClassInfo;
}

namespace {
/// MachineScheduler runs after coalescing and before register allocation.
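/// It selects a ScheduleDAGInstrs-based scheduler and runs it over every
/// scheduling region of each basic block (see runOnMachineFunction below).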
class MachineScheduler : public MachineSchedContext,
                         public MachineFunctionPass {
public:
  MachineScheduler();

  virtual void getAnalysisUsage(AnalysisUsage &AU) const;

  virtual void releaseMemory() {}

  virtual bool runOnMachineFunction(MachineFunction&);

  virtual void print(raw_ostream &O, const Module* = 0) const;

  static char ID; // Class identification, replacement for typeinfo
};
} // namespace

char MachineScheduler::ID = 0;

char &llvm::MachineSchedulerID = MachineScheduler::ID;

INITIALIZE_PASS_BEGIN(MachineScheduler, "misched",
                      "Machine Instruction Scheduler", false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(MachineScheduler, "misched",
                    "Machine Instruction Scheduler", false, false)

MachineScheduler::MachineScheduler()
    : MachineFunctionPass(ID) {
  initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
}

void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addRequiredID(MachineDominatorsID);
  AU.addRequired<MachineLoopInfo>();
  AU.addRequired<AliasAnalysis>();
  AU.addRequired<TargetPassConfig>();
  AU.addRequired<SlotIndexes>();
  AU.addPreserved<SlotIndexes>();
  AU.addRequired<LiveIntervals>();
  AU.addPreserved<LiveIntervals>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachinePassRegistry MachineSchedRegistry::Registry;

/// A dummy default scheduler factory indicates whether the scheduler
/// is overridden on the command line.
static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
  return 0;
}

/// MachineSchedOpt allows command line selection of the scheduler.
static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
               RegisterPassParser<MachineSchedRegistry> >
MachineSchedOpt("misched",
                cl::init(&useDefaultMachineSched), cl::Hidden,
                cl::desc("Machine instruction scheduler to use"));

static MachineSchedRegistry
DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
                     useDefaultMachineSched);

/// Forward declare the standard machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);


/// Decrement this iterator until reaching the top or a non-debug instr.
static MachineBasicBlock::const_iterator
priorNonDebug(MachineBasicBlock::const_iterator I,
              MachineBasicBlock::const_iterator Beg) {
  assert(I != Beg && "reached the top of the region, cannot decrement");
  while (--I != Beg) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

/// Non-const version.
static MachineBasicBlock::iterator
priorNonDebug(MachineBasicBlock::iterator I,
              MachineBasicBlock::const_iterator Beg) {
  return const_cast<MachineInstr*>(
    &*priorNonDebug(MachineBasicBlock::const_iterator(I), Beg));
}

/// If this iterator is a debug value, increment until reaching the End or a
/// non-debug instruction.
static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,
            MachineBasicBlock::const_iterator End) {
  for(; I != End; ++I) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  // Cast the return value to nonconst MachineInstr, then cast to an
  // instr_iterator, which does not check for null, finally return a
  // bundle_iterator.
  return MachineBasicBlock::instr_iterator(
    const_cast<MachineInstr*>(
      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
}

/// Top-level MachineScheduler pass driver.
///
/// Visit blocks in function order. Divide each block into scheduling regions
/// and visit them bottom-up. Visiting regions bottom-up is not required, but is
/// consistent with the DAG builder, which traverses the interior of the
/// scheduling regions bottom-up.
///
/// This design avoids exposing scheduling boundaries to the DAG builder,
/// simplifying the DAG builder's support for "special" target instructions.
/// At the same time the design allows target schedulers to operate across
/// scheduling boundaries, for example to bundle the boundary instructions
/// without reordering them. This creates complexity, because the target
/// scheduler must update the RegionBegin and RegionEnd positions cached by
/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
/// design would be to split blocks at scheduling boundaries, but LLVM has a
/// general bias against block splitting purely for implementation simplicity.
bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
  DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));

  // Initialize the context of the pass.
  MF = &mf;
  MLI = &getAnalysis<MachineLoopInfo>();
  MDT = &getAnalysis<MachineDominatorTree>();
  PassConfig = &getAnalysis<TargetPassConfig>();
  AA = &getAnalysis<AliasAnalysis>();

  LIS = &getAnalysis<LiveIntervals>();
  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();

  if (VerifyScheduling) {
    DEBUG(LIS->dump());
    MF->verify(this, "Before machine scheduling.");
  }
  RegClassInfo->runOnMachineFunction(*MF);

  // Select the scheduler, or set the default.
  MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
  if (Ctor == useDefaultMachineSched) {
    // Get the default scheduler set by the target.
    Ctor = MachineSchedRegistry::getDefault();
    if (!Ctor) {
      Ctor = createConvergingSched;
      MachineSchedRegistry::setDefault(Ctor);
    }
  }
  // Instantiate the selected scheduler.
  OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));

  // Visit all machine basic blocks.
  //
  // TODO: Visit blocks in global postorder or postorder within the bottom-up
  // loop tree. Then we can optionally compute global RegPressure.
  for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
       MBB != MBBEnd; ++MBB) {

    Scheduler->startBlock(MBB);

    // Break the block into scheduling regions [I, RegionEnd), and schedule each
    // region as soon as it is discovered. RegionEnd points to the scheduling
    // boundary at the bottom of the region. The DAG does not include RegionEnd,
    // but the region does (i.e. the next RegionEnd is above the previous
    // RegionBegin). If the current block has no terminator then RegionEnd ==
    // MBB->end() for the bottom region.
    //
    // The Scheduler may insert instructions during either schedule() or
    // exitRegion(), even for empty regions. So the local iterators 'I' and
    // 'RegionEnd' are invalid across these calls.
    unsigned RemainingInstrs = MBB->size();
    for(MachineBasicBlock::iterator RegionEnd = MBB->end();
        RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {

      // Avoid decrementing RegionEnd for blocks with no terminator.
      if (RegionEnd != MBB->end()
          || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
        --RegionEnd;
        // Count the boundary instruction.
        --RemainingInstrs;
      }

      // The next region starts above the previous region. Look backward in the
      // instruction stream until we find the nearest boundary.
      unsigned NumRegionInstrs = 0;
      MachineBasicBlock::iterator I = RegionEnd;
      for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
        if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
          break;
      }
      // Notify the scheduler of the region, even if we may skip scheduling
      // it. Perhaps it still needs to be bundled.
      Scheduler->enterRegion(MBB, I, RegionEnd, NumRegionInstrs);

      // Skip empty scheduling regions (0 or 1 schedulable instructions).
      if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
        // Close the current region. Bundle the terminator if needed.
        // This invalidates 'RegionEnd' and 'I'.
        Scheduler->exitRegion();
        continue;
      }
      DEBUG(dbgs() << "********** MI Scheduling **********\n");
      DEBUG(dbgs() << MF->getName()
            << ":BB#" << MBB->getNumber() << " " << MBB->getName()
            << "\n From: " << *I << " To: ";
            if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
            else dbgs() << "End";
            dbgs() << " RegionInstrs: " << NumRegionInstrs
            << " Remaining: " << RemainingInstrs << "\n");

      // Schedule a region: possibly reorder instructions.
      // This invalidates 'RegionEnd' and 'I'.
      Scheduler->schedule();

      // Close the current region.
      Scheduler->exitRegion();

      // Scheduling has invalidated the current iterator 'I'. Ask the
      // scheduler for the top of its scheduled region.
      RegionEnd = Scheduler->begin();
    }
    assert(RemainingInstrs == 0 && "Instruction count mismatch!");
    Scheduler->finishBlock();
  }
  Scheduler->finalizeSchedule();
  DEBUG(LIS->dump());
  if (VerifyScheduling)
    MF->verify(this, "After machine scheduling.");
  return true;
}

void MachineScheduler::print(raw_ostream &O, const Module* m) const {
  // unimplemented
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ReadyQueue::dump() {
  dbgs() << Name << ": ";
  for (unsigned i = 0, e = Queue.size(); i < e; ++i)
    dbgs() << Queue[i]->NodeNum << " ";
  dbgs() << "\n";
}
#endif

//===----------------------------------------------------------------------===//
// ScheduleDAGMI - Base class for MachineInstr scheduling with LiveIntervals
// preservation.
//===----------------------------------------------------------------------===//

ScheduleDAGMI::~ScheduleDAGMI() {
  delete DFSResult;
  DeleteContainerPointers(Mutations);
  delete SchedImpl;
}

bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
  return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
}

bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
  if (SuccSU != &ExitSU) {
    // Do not use WillCreateCycle, it assumes SD scheduling.
    // If Pred is reachable from Succ, then the edge creates a cycle.
    if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
      return false;
    Topo.AddPred(SuccSU, PredDep.getSUnit());
  }
  SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
  // Return true regardless of whether a new edge needed to be inserted.
  return true;
}

/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
/// NumPredsLeft reaches zero, release the successor node.
///
/// FIXME: Adjust SuccSU height based on MinLatency.
void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    if (SuccEdge->isCluster())
      NextClusterSucc = SuccSU;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    SuccSU->dump(this);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(0);
  }
#endif
  --SuccSU->NumPredsLeft;
  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
    SchedImpl->releaseTopNode(SuccSU);
}

/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
       I != E; ++I) {
    releaseSucc(SU, &*I);
  }
}

/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
/// NumSuccsLeft reaches zero, release the predecessor node.
///
/// FIXME: Adjust PredSU height based on MinLatency.
void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
  SUnit *PredSU = PredEdge->getSUnit();

  if (PredEdge->isWeak()) {
    --PredSU->WeakSuccsLeft;
    if (PredEdge->isCluster())
      NextClusterPred = PredSU;
    return;
  }
#ifndef NDEBUG
  if (PredSU->NumSuccsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    PredSU->dump(this);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(0);
  }
#endif
  --PredSU->NumSuccsLeft;
  if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
    SchedImpl->releaseBottomNode(PredSU);
}

/// releasePredecessors - Call releasePred on each of SU's predecessors.
void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
       I != E; ++I) {
    releasePred(SU, &*I);
  }
}

/// This is normally called from the main scheduler loop but may also be invoked
/// by the scheduling strategy to perform additional code motion.
void ScheduleDAGMI::moveInstruction(MachineInstr *MI,
                                    MachineBasicBlock::iterator InsertPos) {
  // Advance RegionBegin if the first instruction moves down.
  if (&*RegionBegin == MI)
    ++RegionBegin;

  // Update the instruction stream.
  BB->splice(InsertPos, BB, MI);

  // Update LiveIntervals
  LIS->handleMove(MI, /*UpdateFlags=*/true);

  // Recede RegionBegin if an instruction moves above the first.
  if (RegionBegin == InsertPos)
    RegionBegin = MI;
}

bool ScheduleDAGMI::checkSchedLimit() {
#ifndef NDEBUG
  if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
    CurrentTop = CurrentBottom;
    return false;
  }
  ++NumInstrsScheduled;
#endif
  return true;
}

/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
/// crossing a scheduling boundary. [begin, end) includes all instructions in
/// the region, including the boundary itself and single-instruction regions
/// that don't get scheduled.
void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
                                MachineBasicBlock::iterator begin,
                                MachineBasicBlock::iterator end,
                                unsigned regioninstrs)
{
  ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);

  // For convenience remember the end of the liveness region.
  LiveRegionEnd =
    (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
}

// Setup the register pressure trackers for the top and bottom scheduled
// regions.
void ScheduleDAGMI::initRegPressure() {
  TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
  BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);

  // Close the RPTracker to finalize live ins.
  RPTracker.closeRegion();

  DEBUG(RPTracker.dump());

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Close one end of the tracker so we can call
  // getMaxUpward/DownwardPressureDelta before advancing across any
  // instructions. This converts currently live regs into live ins/outs.
  TopRPTracker.closeTop();
  BotRPTracker.closeBottom();

  BotRPTracker.initLiveThru(RPTracker);
  if (!BotRPTracker.getLiveThru().empty()) {
    TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
    DEBUG(dbgs() << "Live Thru: ";
          dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
  };

  // For each live out vreg reduce the pressure change associated with other
  // uses of the same vreg below the live-out reaching def.
  updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);

  // Account for liveness generated by the region boundary.
  if (LiveRegionEnd != RegionEnd) {
    SmallVector<unsigned, 8> LiveUses;
    BotRPTracker.recede(&LiveUses);
    updatePressureDiffs(LiveUses);
  }

  assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");

  // Cache the list of excess pressure sets in this region. This will also track
  // the max pressure in the scheduled code for these sets.
  RegionCriticalPSets.clear();
  const std::vector<unsigned> &RegionPressure =
    RPTracker.getPressure().MaxSetPressure;
  for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
    unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
    if (RegionPressure[i] > Limit) {
      DEBUG(dbgs() << TRI->getRegPressureSetName(i)
            << " Limit " << Limit
            << " Actual " << RegionPressure[i] << "\n");
      RegionCriticalPSets.push_back(PressureChange(i));
    }
  }
  DEBUG(dbgs() << "Excess PSets: ";
        for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
          dbgs() << TRI->getRegPressureSetName(
            RegionCriticalPSets[i].getPSet()) << " ";
        dbgs() << "\n");
}

// FIXME: When the pressure tracker deals in pressure differences then we won't
// iterate over all RegionCriticalPSets[i].
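//
// Record the new maximum pressure reached by the scheduled code for each
// pressure set that was already over its limit when the region was entered
// (the sets cached in RegionCriticalPSets above).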
void ScheduleDAGMI::
updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure) {
  for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
    unsigned ID = RegionCriticalPSets[i].getPSet();
    if ((int)NewMaxPressure[ID] > RegionCriticalPSets[i].getUnitInc()
        && NewMaxPressure[ID] <= INT16_MAX)
      RegionCriticalPSets[i].setUnitInc(NewMaxPressure[ID]);
  }
  DEBUG(
    for (unsigned i = 0, e = NewMaxPressure.size(); i < e; ++i) {
      unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
      if (NewMaxPressure[i] > Limit) {
        dbgs() << "  " << TRI->getRegPressureSetName(i) << ": "
               << NewMaxPressure[i] << " > " << Limit << "\n";
      }
    });
}

/// Update the PressureDiff array for liveness after scheduling this
/// instruction.
void ScheduleDAGMI::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
  for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) {
    /// FIXME: Currently assuming single-use physregs.
    unsigned Reg = LiveUses[LUIdx];
    if (!TRI->isVirtualRegister(Reg))
      continue;
    // This may be called before CurrentBottom has been initialized. However,
    // BotRPTracker must have a valid position. We want the value live into the
    // instruction or live out of the block, so ask for the previous
    // instruction's live-out.
    const LiveInterval &LI = LIS->getInterval(Reg);
    VNInfo *VNI;
    MachineBasicBlock::const_iterator I =
      nextIfDebug(BotRPTracker.getPos(), BB->end());
    if (I == BB->end())
      VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
    else {
      LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(I));
      VNI = LRQ.valueIn();
    }
    // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
    assert(VNI && "No live value at use.");
    for (VReg2UseMap::iterator
           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
      SUnit *SU = UI->SU;
      // If this use comes before the reaching def, it cannot be a last use, so
      // decrease its pressure change.
      if (!SU->isScheduled && SU != &ExitSU) {
        LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(SU->getInstr()));
        if (LRQ.valueIn() == VNI)
          getPressureDiff(SU).addPressureChange(Reg, true, &MRI);
      }
    }
  }
}

/// schedule - Called back from MachineScheduler::runOnMachineFunction
/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
/// only includes instructions that have DAG nodes, not scheduling boundaries.
///
/// This is a skeletal driver, with all the functionality pushed into helpers,
/// so that it can be easily extended by experimental schedulers. Generally,
/// implementing MachineSchedStrategy should be sufficient to implement a new
/// scheduling algorithm. However, if a scheduler further subclasses
/// ScheduleDAGMI then it will want to override this virtual method in order to
/// update any specialized state.
void ScheduleDAGMI::schedule() {
  buildDAGWithRegPressure();

  Topo.InitDAGTopologicalSorting();

  postprocessDAG();

  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  findRootsAndBiasEdges(TopRoots, BotRoots);

  // Initialize the strategy before modifying the DAG.
  // This may initialize a DFSResult to be used for queue priority.
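  // For the default ConvergingScheduler this also sets up the per-zone hazard
  // recognizers and resource counts (see ConvergingScheduler::initialize below).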
  SchedImpl->initialize(this);

  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
          SUnits[su].dumpAll(this));
  if (ViewMISchedDAGs) viewGraph();

  // Initialize ready queues now that the DAG and priority data are finalized.
  initQueues(TopRoots, BotRoots);

  bool IsTopNode = false;
  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
    assert(!SU->isScheduled && "Node already scheduled");
    if (!checkSchedLimit())
      break;

    scheduleMI(SU, IsTopNode);

    updateQueues(SU, IsTopNode);
  }
  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
      unsigned BBNum = begin()->getParent()->getNumber();
      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
      dumpSchedule();
      dbgs() << '\n';
    });
}

/// Build the DAG and setup three register pressure trackers.
void ScheduleDAGMI::buildDAGWithRegPressure() {
  // Initialize the register pressure tracker used by buildSchedGraph.
  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
                 /*TrackUntiedDefs=*/true);

  // Account for liveness generated by the region boundary.
  if (LiveRegionEnd != RegionEnd)
    RPTracker.recede();

  // Build the DAG, and compute current register pressure.
  buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);

  // Initialize top/bottom trackers after computing region pressure.
  initRegPressure();
}

/// Apply each ScheduleDAGMutation step in order.
void ScheduleDAGMI::postprocessDAG() {
  for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
    Mutations[i]->apply(this);
  }
}

void ScheduleDAGMI::computeDFSResult() {
  if (!DFSResult)
    DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
  DFSResult->clear();
  ScheduledTrees.clear();
  DFSResult->resize(SUnits.size());
  DFSResult->compute(SUnits);
  ScheduledTrees.resize(DFSResult->getNumSubtrees());
}

void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
                                          SmallVectorImpl<SUnit*> &BotRoots) {
  for (std::vector<SUnit>::iterator
         I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
    SUnit *SU = &(*I);
    assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits");

    // Order predecessors so DFSResult follows the critical path.
    SU->biasCriticalPath();

    // A SUnit is ready to top schedule if it has no predecessors.
    if (!I->NumPredsLeft)
      TopRoots.push_back(SU);
    // A SUnit is ready to bottom schedule if it has no successors.
    if (!I->NumSuccsLeft)
      BotRoots.push_back(SU);
  }
  ExitSU.biasCriticalPath();
}

/// Compute the max cyclic critical path through the DAG. The scheduling DAG
/// only provides the critical path for single block loops. To handle loops that
/// span blocks, we could use the vreg path latencies provided by
/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
/// available for use in the scheduler.
///
/// The cyclic path estimation identifies a def-use pair that crosses the back
/// edge and considers the depth and height of the nodes. For example, consider
/// the following instruction sequence where each instruction has unit latency
/// and defines an eponymous virtual register:
///
///   a->b(a,c)->c(b)->d(c)->exit
///
/// The cyclic critical path is two cycles: b->c->b
/// The acyclic critical path is four cycles: a->b->c->d->exit
/// LiveOutHeight = height(c) = len(c->d->exit) = 2
/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
/// LiveInDepth = depth(b) = len(a->b) = 1
///
/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
/// CyclicCriticalPath = min(2, 2) = 2
unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
  // This only applies to single block loops.
  if (!BB->isSuccessor(BB))
    return 0;

  unsigned MaxCyclicLatency = 0;
  // Visit each live out vreg def to find def/use pairs that cross iterations.
  ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
  for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
       RI != RE; ++RI) {
    unsigned Reg = *RI;
    if (!TRI->isVirtualRegister(Reg))
      continue;
    const LiveInterval &LI = LIS->getInterval(Reg);
    const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
    if (!DefVNI)
      continue;

    MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
    const SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;

    unsigned LiveOutHeight = DefSU->getHeight();
    unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
    // Visit all local users of the vreg def.
    for (VReg2UseMap::iterator
           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
      if (UI->SU == &ExitSU)
        continue;

      // Only consider uses of the phi.
      LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
      if (!LRQ.valueIn()->isPHIDef())
        continue;

      // Assume that a path spanning two iterations is a cycle, which could
      // overestimate in strange cases. This allows cyclic latency to be
      // estimated as the minimum slack of the vreg's depth or height.
      unsigned CyclicLatency = 0;
      if (LiveOutDepth > UI->SU->getDepth())
        CyclicLatency = LiveOutDepth - UI->SU->getDepth();

      unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
      if (LiveInHeight > LiveOutHeight) {
        if (LiveInHeight - LiveOutHeight < CyclicLatency)
          CyclicLatency = LiveInHeight - LiveOutHeight;
      }
      else
        CyclicLatency = 0;

      DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
            << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
      if (CyclicLatency > MaxCyclicLatency)
        MaxCyclicLatency = CyclicLatency;
    }
  }
  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
  return MaxCyclicLatency;
}

/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
                               ArrayRef<SUnit*> BotRoots) {
  NextClusterSucc = NULL;
  NextClusterPred = NULL;

  // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
  //
  // Nodes with unreleased weak edges can still be roots.
  // Release top roots in forward order.
  for (SmallVectorImpl<SUnit*>::const_iterator
         I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) {
    SchedImpl->releaseTopNode(*I);
  }
  // Release bottom roots in reverse order so the higher priority nodes appear
  // first. This is more natural and slightly more efficient.
  for (SmallVectorImpl<SUnit*>::const_reverse_iterator
         I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
    SchedImpl->releaseBottomNode(*I);
  }

  releaseSuccessors(&EntrySU);
  releasePredecessors(&ExitSU);

  SchedImpl->registerRoots();

  // Advance past initial DebugValues.
  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
  TopRPTracker.setPos(CurrentTop);

  CurrentBottom = RegionEnd;
}

/// Move an instruction and update register pressure.
void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
  // Move the instruction to its new location in the instruction stream.
  MachineInstr *MI = SU->getInstr();

  if (IsTopNode) {
    assert(SU->isTopReady() && "node still has unscheduled dependencies");
    if (&*CurrentTop == MI)
      CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
    else {
      moveInstruction(MI, CurrentTop);
      TopRPTracker.setPos(MI);
    }

    // Update top scheduled pressure.
    TopRPTracker.advance();
    assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
    updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
  }
  else {
    assert(SU->isBottomReady() && "node still has unscheduled dependencies");
    MachineBasicBlock::iterator priorII =
      priorNonDebug(CurrentBottom, CurrentTop);
    if (&*priorII == MI)
      CurrentBottom = priorII;
    else {
      if (&*CurrentTop == MI) {
        CurrentTop = nextIfDebug(++CurrentTop, priorII);
        TopRPTracker.setPos(CurrentTop);
      }
      moveInstruction(MI, CurrentBottom);
      CurrentBottom = MI;
    }
    // Update bottom scheduled pressure.
    SmallVector<unsigned, 8> LiveUses;
    BotRPTracker.recede(&LiveUses);
    assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
    updatePressureDiffs(LiveUses);
    updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
  }
}

/// Update scheduler queues after scheduling an instruction.
void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
  // Release dependent instructions for scheduling.
  if (IsTopNode)
    releaseSuccessors(SU);
  else
    releasePredecessors(SU);

  SU->isScheduled = true;

  if (DFSResult) {
    unsigned SubtreeID = DFSResult->getSubtreeID(SU);
    if (!ScheduledTrees.test(SubtreeID)) {
      ScheduledTrees.set(SubtreeID);
      DFSResult->scheduleTree(SubtreeID);
      SchedImpl->scheduleTree(SubtreeID);
    }
  }

  // Notify the scheduling strategy after updating the DAG.
  SchedImpl->schedNode(SU, IsTopNode);
}

/// Reinsert any remaining debug_values, just like the PostRA scheduler.
void ScheduleDAGMI::placeDebugValues() {
  // If first instruction was a DBG_VALUE then put it back.
  if (FirstDbgValue) {
    BB->splice(RegionBegin, BB, FirstDbgValue);
    RegionBegin = FirstDbgValue;
  }

  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
    MachineInstr *DbgValue = P.first;
    MachineBasicBlock::iterator OrigPrevMI = P.second;
    if (&*RegionBegin == DbgValue)
      ++RegionBegin;
    BB->splice(++OrigPrevMI, BB, DbgValue);
    if (OrigPrevMI == llvm::prior(RegionEnd))
      RegionEnd = DbgValue;
  }
  DbgValues.clear();
  FirstDbgValue = NULL;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ScheduleDAGMI::dumpSchedule() const {
  for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
    if (SUnit *SU = getSUnit(&(*MI)))
      SU->dump(this);
    else
      dbgs() << "Missing SUnit\n";
  }
}
#endif

//===----------------------------------------------------------------------===//
// LoadClusterMutation - DAG post-processing to cluster loads.
//===----------------------------------------------------------------------===//

namespace {
/// \brief Post-process the DAG to create cluster edges between neighboring
/// loads.
class LoadClusterMutation : public ScheduleDAGMutation {
  struct LoadInfo {
    SUnit *SU;
    unsigned BaseReg;
    unsigned Offset;
    LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
      : SU(su), BaseReg(reg), Offset(ofs) {}
  };
  static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
                           const LoadClusterMutation::LoadInfo &RHS);

  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
public:
  LoadClusterMutation(const TargetInstrInfo *tii,
                      const TargetRegisterInfo *tri)
    : TII(tii), TRI(tri) {}

  virtual void apply(ScheduleDAGMI *DAG);
protected:
  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
};
} // anonymous

bool LoadClusterMutation::LoadInfoLess(
  const LoadClusterMutation::LoadInfo &LHS,
  const LoadClusterMutation::LoadInfo &RHS) {
  if (LHS.BaseReg != RHS.BaseReg)
    return LHS.BaseReg < RHS.BaseReg;
  return LHS.Offset < RHS.Offset;
}

void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
                                                  ScheduleDAGMI *DAG) {
  SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
    SUnit *SU = Loads[Idx];
    unsigned BaseReg;
    unsigned Offset;
    if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
  }
  if (LoadRecords.size() < 2)
    return;
  std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
  unsigned ClusterLength = 1;
  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
      ClusterLength = 1;
      continue;
    }

    SUnit *SUa = LoadRecords[Idx].SU;
    SUnit *SUb = LoadRecords[Idx+1].SU;
    if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
        && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {

      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
            << SUb->NodeNum << ")\n");
      // Copy successor edges from SUa to SUb. Interleaving computation
      // dependent on SUa can prevent load combining due to register reuse.
      // Predecessor edges do not need to be copied from SUb to SUa since nearby
      // loads should have effectively the same inputs.
      for (SUnit::const_succ_iterator
             SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
        if (SI->getSUnit() == SUb)
          continue;
        DEBUG(dbgs() << "  Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
        DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
      }
      ++ClusterLength;
    }
    else
      ClusterLength = 1;
  }
}

/// \brief Callback from DAG postProcessing to create cluster edges for loads.
void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
  // Map DAG NodeNum to store chain ID.
  DenseMap<unsigned, unsigned> StoreChainIDs;
  // Map each store chain to a set of dependent loads.
  SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
    SUnit *SU = &DAG->SUnits[Idx];
    if (!SU->getInstr()->mayLoad())
      continue;
    unsigned ChainPredID = DAG->SUnits.size();
    for (SUnit::const_pred_iterator
           PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
      if (PI->isCtrl()) {
        ChainPredID = PI->getSUnit()->NodeNum;
        break;
      }
    }
    // Check if this chain-like pred has been seen
    // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
    unsigned NumChains = StoreChainDependents.size();
    std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
      StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
    if (Result.second)
      StoreChainDependents.resize(NumChains + 1);
    StoreChainDependents[Result.first->second].push_back(SU);
  }
  // Iterate over the store chains.
  for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
}

//===----------------------------------------------------------------------===//
// MacroFusion - DAG post-processing to encourage fusion of macro ops.
//===----------------------------------------------------------------------===//

namespace {
/// \brief Post-process the DAG to create cluster edges between instructions
/// that may be fused by the processor into a single operation.
class MacroFusion : public ScheduleDAGMutation {
  const TargetInstrInfo *TII;
public:
  MacroFusion(const TargetInstrInfo *tii): TII(tii) {}

  virtual void apply(ScheduleDAGMI *DAG);
};
} // anonymous

/// \brief Callback from DAG postProcessing to create cluster edges to encourage
/// fused operations.
void MacroFusion::apply(ScheduleDAGMI *DAG) {
  // For now, assume targets can only fuse with the branch.
  MachineInstr *Branch = DAG->ExitSU.getInstr();
  if (!Branch)
    return;

  for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) {
    SUnit *SU = &DAG->SUnits[--Idx];
    if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch))
      continue;

    // Create a single weak edge from SU to ExitSU. The only effect is to cause
    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
    // need to copy predecessor edges from ExitSU to SU, since top-down
    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
    // of SU, we could create an artificial edge from the deepest root, but it
    // hasn't been needed yet.
    bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster));
    (void)Success;
    assert(Success && "No DAG nodes should be reachable from ExitSU");

    DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n");
    break;
  }
}

//===----------------------------------------------------------------------===//
// CopyConstrain - DAG post-processing to encourage copy elimination.
//===----------------------------------------------------------------------===//

namespace {
/// \brief Post-process the DAG to create weak edges from all uses of a copy to
/// the one use that defines the copy's source vreg, most likely an induction
/// variable increment.
class CopyConstrain : public ScheduleDAGMutation {
  // Transient state.
  SlotIndex RegionBeginIdx;
  // RegionEndIdx is the slot index of the last non-debug instruction in the
  // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
  SlotIndex RegionEndIdx;
public:
  CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}

  virtual void apply(ScheduleDAGMI *DAG);

protected:
  void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG);
};
} // anonymous

/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
///   I0:     = dst
///   I1: src = ...
///   I2:     = dst
///   I3: dst = src (copy)
/// (create pred->succ edges I0->I1, I2->I1)
///
/// 2) Local copy:
///   I0: dst = src (copy)
///   I1:     = dst
///   I2: src = ...
///   I3:     = dst
/// (create pred->succ edges I1->I2, I3->I2)
///
/// Although the MachineScheduler is currently constrained to single blocks,
/// this algorithm should handle extended blocks. An EBB is a set of
/// contiguously numbered blocks such that the previous block in the EBB is
/// always the single predecessor.
void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) {
  LiveIntervals *LIS = DAG->getLIS();
  MachineInstr *Copy = CopySU->getInstr();

  // Check for pure vreg copies.
  unsigned SrcReg = Copy->getOperand(1).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
    return;

  unsigned DstReg = Copy->getOperand(0).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
    return;

  // Check if either the dest or source is local. If it's live across a back
  // edge, it's not local. Note that if both vregs are live across the back
  // edge, we cannot successfully constrain the copy without cyclic scheduling.
  unsigned LocalReg = DstReg;
  unsigned GlobalReg = SrcReg;
  LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
  if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
    LocalReg = SrcReg;
    GlobalReg = DstReg;
    LocalLI = &LIS->getInterval(LocalReg);
    if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
      return;
  }
  LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);

  // Find the global segment after the start of the local LI.
  LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
  // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
  // local live range. We could create edges from other global uses to the local
  // start, but the coalescer should have already eliminated these cases, so
  // don't bother dealing with it.
  if (GlobalSegment == GlobalLI->end())
    return;

  // If GlobalSegment is killed at the LocalLI->start, the call to find()
  // returned the next global segment. But if GlobalSegment overlaps with
  // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
  // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
  if (GlobalSegment->contains(LocalLI->beginIndex()))
    ++GlobalSegment;

  if (GlobalSegment == GlobalLI->end())
    return;

  // Check if GlobalLI contains a hole in the vicinity of LocalLI.
  if (GlobalSegment != GlobalLI->begin()) {
    // Two address defs have no hole.
    if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end,
                               GlobalSegment->start)) {
      return;
    }
    // If the prior global segment may be defined by the same two-address
    // instruction that also defines LocalLI, then we can't make a hole here.
    if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->start,
                               LocalLI->beginIndex())) {
      return;
    }
    // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
    // it would be a disconnected component in the live range.
    assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() &&
           "Disconnected LRG within the scheduling region.");
  }
  MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
  if (!GlobalDef)
    return;

  SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
  if (!GlobalSU)
    return;

  // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
  // constraining the uses of the last local def to precede GlobalDef.
  SmallVector<SUnit*,8> LocalUses;
  const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
  MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
  SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
  for (SUnit::const_succ_iterator
         I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end();
       I != E; ++I) {
    if (I->getKind() != SDep::Data || I->getReg() != LocalReg)
      continue;
    if (I->getSUnit() == GlobalSU)
      continue;
    if (!DAG->canAddEdge(GlobalSU, I->getSUnit()))
      return;
    LocalUses.push_back(I->getSUnit());
  }
  // Open the top of the GlobalLI hole by constraining any earlier global uses
  // to precede the start of LocalLI.
  SmallVector<SUnit*,8> GlobalUses;
  MachineInstr *FirstLocalDef =
    LIS->getInstructionFromIndex(LocalLI->beginIndex());
  SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
  for (SUnit::const_pred_iterator
         I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) {
    if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg)
      continue;
    if (I->getSUnit() == FirstLocalSU)
      continue;
    if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit()))
      return;
    GlobalUses.push_back(I->getSUnit());
  }
  DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
  // Add the weak edges.
  for (SmallVectorImpl<SUnit*>::const_iterator
         I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
    DEBUG(dbgs() << "  Local use SU(" << (*I)->NodeNum << ") -> SU("
          << GlobalSU->NodeNum << ")\n");
    DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
  }
  for (SmallVectorImpl<SUnit*>::const_iterator
         I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
    DEBUG(dbgs() << "  Global use SU(" << (*I)->NodeNum << ") -> SU("
          << FirstLocalSU->NodeNum << ")\n");
    DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
  }
}

/// \brief Callback from DAG postProcessing to create weak edges to encourage
/// copy elimination.
void CopyConstrain::apply(ScheduleDAGMI *DAG) {
  MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
  if (FirstPos == DAG->end())
    return;
  RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos);
  RegionEndIdx = DAG->getLIS()->getInstructionIndex(
    &*priorNonDebug(DAG->end(), DAG->begin()));

  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
    SUnit *SU = &DAG->SUnits[Idx];
    if (!SU->getInstr()->isCopy())
      continue;

    constrainLocalCopy(SU, DAG);
  }
}

//===----------------------------------------------------------------------===//
// ConvergingScheduler - Implementation of the generic MachineSchedStrategy.
//===----------------------------------------------------------------------===//

namespace {
/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
/// the schedule.
class ConvergingScheduler : public MachineSchedStrategy {
public:
  /// Represent the type of SchedCandidate found within a single queue.
  /// pickNodeBidirectional depends on these listed by decreasing priority.
  enum CandReason {
    NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
    ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
    TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};

#ifndef NDEBUG
  static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
#endif

  /// Policy for scheduling the next instruction in the candidate's zone.
  struct CandPolicy {
    bool ReduceLatency;
    unsigned ReduceResIdx;
    unsigned DemandResIdx;

    CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
  };

  /// Status of an instruction's critical resource consumption.
  struct SchedResourceDelta {
    // Count critical resources in the scheduled region required by SU.
    unsigned CritResources;

    // Count critical resources from another region consumed by SU.
    unsigned DemandedResources;

    SchedResourceDelta(): CritResources(0), DemandedResources(0) {}

    bool operator==(const SchedResourceDelta &RHS) const {
      return CritResources == RHS.CritResources
        && DemandedResources == RHS.DemandedResources;
    }
    bool operator!=(const SchedResourceDelta &RHS) const {
      return !operator==(RHS);
    }
  };

  /// Store the state used by ConvergingScheduler heuristics, required for the
  /// lifetime of one invocation of pickNode().
  struct SchedCandidate {
    CandPolicy Policy;

    // The best SUnit candidate.
    SUnit *SU;

    // The reason for this candidate.
    CandReason Reason;

    // Set of reasons that apply to multiple candidates.
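    // One bit per CandReason; tested and set via isRepeat/setRepeat below.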
    uint32_t RepeatReasonSet;

    // Register pressure values for the best candidate.
    RegPressureDelta RPDelta;

    // Critical resource consumption of the best candidate.
    SchedResourceDelta ResDelta;

    SchedCandidate(const CandPolicy &policy)
      : Policy(policy), SU(NULL), Reason(NoCand), RepeatReasonSet(0) {}

    bool isValid() const { return SU; }

    // Copy the status of another candidate without changing policy.
    void setBest(SchedCandidate &Best) {
      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
      SU = Best.SU;
      Reason = Best.Reason;
      RPDelta = Best.RPDelta;
      ResDelta = Best.ResDelta;
    }

    bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); }
    void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); }

    void initResourceDelta(const ScheduleDAGMI *DAG,
                           const TargetSchedModel *SchedModel);
  };

  /// Summarize the unscheduled region.
  struct SchedRemainder {
    // Critical path through the DAG in expected latency.
    unsigned CriticalPath;
    unsigned CyclicCritPath;

    // Scaled count of micro-ops left to schedule.
    unsigned RemIssueCount;

    bool IsAcyclicLatencyLimited;

    // Unscheduled resources
    SmallVector<unsigned, 16> RemainingCounts;

    void reset() {
      CriticalPath = 0;
      CyclicCritPath = 0;
      RemIssueCount = 0;
      IsAcyclicLatencyLimited = false;
      RemainingCounts.clear();
    }

    SchedRemainder() { reset(); }

    void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
  };

  /// Each Scheduling boundary is associated with ready queues. It tracks the
  /// current cycle in the direction of movement, and maintains the state
  /// of "hazards" and other interlocks at the current cycle.
  struct SchedBoundary {
    ScheduleDAGMI *DAG;
    const TargetSchedModel *SchedModel;
    SchedRemainder *Rem;

    ReadyQueue Available;
    ReadyQueue Pending;
    bool CheckPending;

    // For heuristics, keep a list of the nodes that immediately depend on the
    // most recently scheduled node.
    SmallPtrSet<const SUnit*, 8> NextSUs;

    ScheduleHazardRecognizer *HazardRec;

    /// Number of cycles it takes to issue the instructions scheduled in this
    /// zone. It is defined as: scheduled-micro-ops / issue-width + stalls.
    /// See getStalls().
    unsigned CurrCycle;

    /// Micro-ops issued in the current cycle
    unsigned CurrMOps;

    /// MinReadyCycle - Cycle of the soonest available instruction.
    unsigned MinReadyCycle;

    // The expected latency of the critical path in this scheduled zone.
    unsigned ExpectedLatency;

    // The latency of dependence chains leading into this zone.
    // For each node scheduled bottom-up: DLat = max DLat, N.Depth.
    // For each cycle scheduled: DLat -= 1.
    unsigned DependentLatency;

    /// Count the scheduled (issued) micro-ops that can be retired by
    /// time=CurrCycle assuming the first scheduled instr is retired at time=0.
    unsigned RetiredMOps;

    // Count scheduled resources that have been executed. Resources are
    // considered executed if they become ready in the time that it takes to
    // saturate any resource including the one in question. Counts are scaled
    // for direct comparison with other resources. Counts can be compared with
    // MOps * getMicroOpFactor and Latency * getLatencyFactor.
    SmallVector<unsigned, 16> ExecutedResCounts;

    /// Cache the max count for a single resource.
    unsigned MaxExecutedResCount;

    // Cache the critical resources ID in this scheduled zone.
    unsigned ZoneCritResIdx;

    // Is the scheduled region resource limited vs. latency limited.
    bool IsResourceLimited;

#ifndef NDEBUG
    // Remember the greatest operand latency as an upper bound on the number of
    // times we should retry the pending queue because of a hazard.
    unsigned MaxObservedLatency;
#endif

    void reset() {
      // A new HazardRec is created for each DAG and owned by SchedBoundary.
      delete HazardRec;

      Available.clear();
      Pending.clear();
      CheckPending = false;
      NextSUs.clear();
      HazardRec = 0;
      CurrCycle = 0;
      CurrMOps = 0;
      MinReadyCycle = UINT_MAX;
      ExpectedLatency = 0;
      DependentLatency = 0;
      RetiredMOps = 0;
      MaxExecutedResCount = 0;
      ZoneCritResIdx = 0;
      IsResourceLimited = false;
#ifndef NDEBUG
      MaxObservedLatency = 0;
#endif
      // Reserve a zero-count for invalid CritResIdx.
      ExecutedResCounts.resize(1);
      assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
    }

    /// Pending queues extend the ready queues with the same ID and the
    /// PendingFlag set.
    SchedBoundary(unsigned ID, const Twine &Name):
      DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
      Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
      HazardRec(0) {
      reset();
    }

    ~SchedBoundary() { delete HazardRec; }

    void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel,
              SchedRemainder *rem);

    bool isTop() const {
      return Available.getID() == ConvergingScheduler::TopQID;
    }

#ifndef NDEBUG
    const char *getResourceName(unsigned PIdx) {
      if (!PIdx)
        return "MOps";
      return SchedModel->getProcResource(PIdx)->Name;
    }
#endif

    /// Get the number of latency cycles "covered" by the scheduled
    /// instructions. This is the larger of the critical path within the zone
    /// and the number of cycles required to issue the instructions.
    unsigned getScheduledLatency() const {
      return std::max(ExpectedLatency, CurrCycle);
    }

    unsigned getUnscheduledLatency(SUnit *SU) const {
      return isTop() ? SU->getHeight() : SU->getDepth();
    }

    unsigned getResourceCount(unsigned ResIdx) const {
      return ExecutedResCounts[ResIdx];
    }

    /// Get the scaled count of scheduled micro-ops and resources, including
    /// executed resources.
    unsigned getCriticalCount() const {
      if (!ZoneCritResIdx)
        return RetiredMOps * SchedModel->getMicroOpFactor();
      return getResourceCount(ZoneCritResIdx);
    }

    /// Get a scaled count for the minimum execution time of the scheduled
    /// micro-ops that are ready to execute by getExecutedCount. Notice the
    /// feedback loop.
1530 unsigned getExecutedCount() const { 1531 return std::max(CurrCycle * SchedModel->getLatencyFactor(), 1532 MaxExecutedResCount); 1533 } 1534 1535 bool checkHazard(SUnit *SU); 1536 1537 unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs); 1538 1539 unsigned getOtherResourceCount(unsigned &OtherCritIdx); 1540 1541 void setPolicy(CandPolicy &Policy, SchedBoundary &OtherZone); 1542 1543 void releaseNode(SUnit *SU, unsigned ReadyCycle); 1544 1545 void bumpCycle(unsigned NextCycle); 1546 1547 void incExecutedResources(unsigned PIdx, unsigned Count); 1548 1549 unsigned countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle); 1550 1551 void bumpNode(SUnit *SU); 1552 1553 void releasePending(); 1554 1555 void removeReady(SUnit *SU); 1556 1557 SUnit *pickOnlyChoice(); 1558 1559 #ifndef NDEBUG 1560 void dumpScheduledState(); 1561 #endif 1562 }; 1563 1564 private: 1565 ScheduleDAGMI *DAG; 1566 const TargetSchedModel *SchedModel; 1567 const TargetRegisterInfo *TRI; 1568 1569 // State of the top and bottom scheduled instruction boundaries. 1570 SchedRemainder Rem; 1571 SchedBoundary Top; 1572 SchedBoundary Bot; 1573 1574 public: 1575 /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both) 1576 enum { 1577 TopQID = 1, 1578 BotQID = 2, 1579 LogMaxQID = 2 1580 }; 1581 1582 ConvergingScheduler(): 1583 DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {} 1584 1585 virtual void initialize(ScheduleDAGMI *dag); 1586 1587 virtual SUnit *pickNode(bool &IsTopNode); 1588 1589 virtual void schedNode(SUnit *SU, bool IsTopNode); 1590 1591 virtual void releaseTopNode(SUnit *SU); 1592 1593 virtual void releaseBottomNode(SUnit *SU); 1594 1595 virtual void registerRoots(); 1596 1597 protected: 1598 void checkAcyclicLatency(); 1599 1600 void tryCandidate(SchedCandidate &Cand, 1601 SchedCandidate &TryCand, 1602 SchedBoundary &Zone, 1603 const RegPressureTracker &RPTracker, 1604 RegPressureTracker &TempTracker); 1605 1606 SUnit *pickNodeBidirectional(bool &IsTopNode); 1607 1608 void pickNodeFromQueue(SchedBoundary &Zone, 1609 const RegPressureTracker &RPTracker, 1610 SchedCandidate &Candidate); 1611 1612 void reschedulePhysRegCopies(SUnit *SU, bool isTop); 1613 1614 #ifndef NDEBUG 1615 void traceCandidate(const SchedCandidate &Cand); 1616 #endif 1617 }; 1618 } // namespace 1619 1620 void ConvergingScheduler::SchedRemainder:: 1621 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { 1622 reset(); 1623 if (!SchedModel->hasInstrSchedModel()) 1624 return; 1625 RemainingCounts.resize(SchedModel->getNumProcResourceKinds()); 1626 for (std::vector<SUnit>::iterator 1627 I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) { 1628 const MCSchedClassDesc *SC = DAG->getSchedClass(&*I); 1629 RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC) 1630 * SchedModel->getMicroOpFactor(); 1631 for (TargetSchedModel::ProcResIter 1632 PI = SchedModel->getWriteProcResBegin(SC), 1633 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 1634 unsigned PIdx = PI->ProcResourceIdx; 1635 unsigned Factor = SchedModel->getResourceFactor(PIdx); 1636 RemainingCounts[PIdx] += (Factor * PI->Cycles); 1637 } 1638 } 1639 } 1640 1641 void ConvergingScheduler::SchedBoundary:: 1642 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { 1643 reset(); 1644 DAG = dag; 1645 SchedModel = smodel; 1646 Rem = rem; 1647 if (SchedModel->hasInstrSchedModel()) 1648 ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds()); 1649 } 1650 1651 void 
ConvergingScheduler::initialize(ScheduleDAGMI *dag) { 1652 DAG = dag; 1653 SchedModel = DAG->getSchedModel(); 1654 TRI = DAG->TRI; 1655 1656 Rem.init(DAG, SchedModel); 1657 Top.init(DAG, SchedModel, &Rem); 1658 Bot.init(DAG, SchedModel, &Rem); 1659 1660 // Initialize resource counts. 1661 1662 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or 1663 // are disabled, then these HazardRecs will be disabled. 1664 const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); 1665 const TargetMachine &TM = DAG->MF.getTarget(); 1666 Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); 1667 Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); 1668 1669 assert((!ForceTopDown || !ForceBottomUp) && 1670 "-misched-topdown incompatible with -misched-bottomup"); 1671 } 1672 1673 void ConvergingScheduler::releaseTopNode(SUnit *SU) { 1674 if (SU->isScheduled) 1675 return; 1676 1677 for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); 1678 I != E; ++I) { 1679 if (I->isWeak()) 1680 continue; 1681 unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle; 1682 unsigned Latency = I->getLatency(); 1683 #ifndef NDEBUG 1684 Top.MaxObservedLatency = std::max(Latency, Top.MaxObservedLatency); 1685 #endif 1686 if (SU->TopReadyCycle < PredReadyCycle + Latency) 1687 SU->TopReadyCycle = PredReadyCycle + Latency; 1688 } 1689 Top.releaseNode(SU, SU->TopReadyCycle); 1690 } 1691 1692 void ConvergingScheduler::releaseBottomNode(SUnit *SU) { 1693 if (SU->isScheduled) 1694 return; 1695 1696 assert(SU->getInstr() && "Scheduled SUnit must have instr"); 1697 1698 for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); 1699 I != E; ++I) { 1700 if (I->isWeak()) 1701 continue; 1702 unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; 1703 unsigned Latency = I->getLatency(); 1704 #ifndef NDEBUG 1705 Bot.MaxObservedLatency = std::max(Latency, Bot.MaxObservedLatency); 1706 #endif 1707 if (SU->BotReadyCycle < SuccReadyCycle + Latency) 1708 SU->BotReadyCycle = SuccReadyCycle + Latency; 1709 } 1710 Bot.releaseNode(SU, SU->BotReadyCycle); 1711 } 1712 1713 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic 1714 /// critical path by more cycles than it takes to drain the instruction buffer. 1715 /// We estimate an upper bounds on in-flight instructions as: 1716 /// 1717 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height ) 1718 /// InFlightIterations = AcyclicPath / CyclesPerIteration 1719 /// InFlightResources = InFlightIterations * LoopResources 1720 /// 1721 /// TODO: Check execution resources in addition to IssueCount. 1722 void ConvergingScheduler::checkAcyclicLatency() { 1723 if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath) 1724 return; 1725 1726 // Scaled number of cycles per loop iteration. 1727 unsigned IterCount = 1728 std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(), 1729 Rem.RemIssueCount); 1730 // Scaled acyclic critical path. 
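// (Illustration with made-up numbers: a cyclic path of 4 cycles, an acyclic
// path of 20 cycles, and 8 micro-ops per iteration keep roughly 20/4 = 5
// iterations, i.e. ~40 micro-ops, in flight; if the micro-op buffer holds
// fewer than that, the region is acyclic-latency limited. The scaled
// equivalent of that computation follows.)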
1731 unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor(); 1732 // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop 1733 unsigned InFlightCount = 1734 (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount; 1735 unsigned BufferLimit = 1736 SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor(); 1737 1738 Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit; 1739 1740 DEBUG(dbgs() << "IssueCycles=" 1741 << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c " 1742 << "IterCycles=" << IterCount / SchedModel->getLatencyFactor() 1743 << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount 1744 << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor() 1745 << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n"; 1746 if (Rem.IsAcyclicLatencyLimited) 1747 dbgs() << " ACYCLIC LATENCY LIMIT\n"); 1748 } 1749 1750 void ConvergingScheduler::registerRoots() { 1751 Rem.CriticalPath = DAG->ExitSU.getDepth(); 1752 1753 // Some roots may not feed into ExitSU. Check all of them in case. 1754 for (std::vector<SUnit*>::const_iterator 1755 I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) { 1756 if ((*I)->getDepth() > Rem.CriticalPath) 1757 Rem.CriticalPath = (*I)->getDepth(); 1758 } 1759 DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); 1760 1761 if (EnableCyclicPath) { 1762 Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); 1763 checkAcyclicLatency(); 1764 } 1765 } 1766 1767 /// Does this SU have a hazard within the current instruction group. 1768 /// 1769 /// The scheduler supports two modes of hazard recognition. The first is the 1770 /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that 1771 /// supports highly complicated in-order reservation tables 1772 /// (ScoreboardHazardRecognizer) and arbitraty target-specific logic. 1773 /// 1774 /// The second is a streamlined mechanism that checks for hazards based on 1775 /// simple counters that the scheduler itself maintains. It explicitly checks 1776 /// for instruction dispatch limitations, including the number of micro-ops that 1777 /// can dispatch per cycle. 1778 /// 1779 /// TODO: Also check whether the SU must start a new group. 1780 bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) { 1781 if (HazardRec->isEnabled()) 1782 return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; 1783 1784 unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); 1785 if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) { 1786 DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" 1787 << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); 1788 return true; 1789 } 1790 return false; 1791 } 1792 1793 // Find the unscheduled node in ReadySUs with the highest latency. 1794 unsigned ConvergingScheduler::SchedBoundary:: 1795 findMaxLatency(ArrayRef<SUnit*> ReadySUs) { 1796 SUnit *LateSU = 0; 1797 unsigned RemLatency = 0; 1798 for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end(); 1799 I != E; ++I) { 1800 unsigned L = getUnscheduledLatency(*I); 1801 if (L > RemLatency) { 1802 RemLatency = L; 1803 LateSU = *I; 1804 } 1805 } 1806 if (LateSU) { 1807 DEBUG(dbgs() << Available.getName() << " RemLatency SU(" 1808 << LateSU->NodeNum << ") " << RemLatency << "c\n"); 1809 } 1810 return RemLatency; 1811 } 1812 1813 // Count resources in this zone and the remaining unscheduled 1814 // instruction. Return the max count, scaled. 
Set OtherCritIdx to the critical 1815 // resource index, or zero if the zone is issue limited. 1816 unsigned ConvergingScheduler::SchedBoundary:: 1817 getOtherResourceCount(unsigned &OtherCritIdx) { 1818 OtherCritIdx = 0; 1819 if (!SchedModel->hasInstrSchedModel()) 1820 return 0; 1821 1822 unsigned OtherCritCount = Rem->RemIssueCount 1823 + (RetiredMOps * SchedModel->getMicroOpFactor()); 1824 DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: " 1825 << OtherCritCount / SchedModel->getMicroOpFactor() << '\n'); 1826 for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds(); 1827 PIdx != PEnd; ++PIdx) { 1828 unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx]; 1829 if (OtherCount > OtherCritCount) { 1830 OtherCritCount = OtherCount; 1831 OtherCritIdx = PIdx; 1832 } 1833 } 1834 if (OtherCritIdx) { 1835 DEBUG(dbgs() << " " << Available.getName() << " + Remain CritRes: " 1836 << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx) 1837 << " " << getResourceName(OtherCritIdx) << "\n"); 1838 } 1839 return OtherCritCount; 1840 } 1841 1842 /// Set the CandPolicy for this zone given the current resources and latencies 1843 /// inside and outside the zone. 1844 void ConvergingScheduler::SchedBoundary::setPolicy(CandPolicy &Policy, 1845 SchedBoundary &OtherZone) { 1846 // Now that potential stalls have been considered, apply preemptive heuristics 1847 // based on the the total latency and resources inside and outside this 1848 // zone. 1849 1850 // Compute remaining latency. We need this both to determine whether the 1851 // overall schedule has become latency-limited and whether the instructions 1852 // outside this zone are resource or latency limited. 1853 // 1854 // The "dependent" latency is updated incrementally during scheduling as the 1855 // max height/depth of scheduled nodes minus the cycles since it was 1856 // scheduled: 1857 // DLat = max (N.depth - (CurrCycle - N.ReadyCycle) for N in Zone 1858 // 1859 // The "independent" latency is the max ready queue depth: 1860 // ILat = max N.depth for N in Available|Pending 1861 // 1862 // RemainingLatency is the greater of independent and dependent latency. 1863 unsigned RemLatency = DependentLatency; 1864 RemLatency = std::max(RemLatency, findMaxLatency(Available.elements())); 1865 RemLatency = std::max(RemLatency, findMaxLatency(Pending.elements())); 1866 1867 // Compute the critical resource outside the zone. 1868 unsigned OtherCritIdx; 1869 unsigned OtherCount = OtherZone.getOtherResourceCount(OtherCritIdx); 1870 1871 bool OtherResLimited = false; 1872 if (SchedModel->hasInstrSchedModel()) { 1873 unsigned LFactor = SchedModel->getLatencyFactor(); 1874 OtherResLimited = (int)(OtherCount - (RemLatency * LFactor)) > (int)LFactor; 1875 } 1876 if (!OtherResLimited && (RemLatency + CurrCycle > Rem->CriticalPath)) { 1877 Policy.ReduceLatency |= true; 1878 DEBUG(dbgs() << " " << Available.getName() << " RemainingLatency " 1879 << RemLatency << " + " << CurrCycle << "c > CritPath " 1880 << Rem->CriticalPath << "\n"); 1881 } 1882 // If the same resource is limiting inside and outside the zone, do nothing. 
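// (For example, if one hypothetical ALU pipe dominates both this zone and
// the remaining instructions, neither reducing nor demanding it would help,
// so the policy is left untouched.)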
1883 if (ZoneCritResIdx == OtherCritIdx) 1884 return; 1885 1886 DEBUG( 1887 if (IsResourceLimited) { 1888 dbgs() << " " << Available.getName() << " ResourceLimited: " 1889 << getResourceName(ZoneCritResIdx) << "\n"; 1890 } 1891 if (OtherResLimited) 1892 dbgs() << " RemainingLimit: " << getResourceName(OtherCritIdx) << "\n"; 1893 if (!IsResourceLimited && !OtherResLimited) 1894 dbgs() << " Latency limited both directions.\n"); 1895 1896 if (IsResourceLimited && !Policy.ReduceResIdx) 1897 Policy.ReduceResIdx = ZoneCritResIdx; 1898 1899 if (OtherResLimited) 1900 Policy.DemandResIdx = OtherCritIdx; 1901 } 1902 1903 void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU, 1904 unsigned ReadyCycle) { 1905 if (ReadyCycle < MinReadyCycle) 1906 MinReadyCycle = ReadyCycle; 1907 1908 // Check for interlocks first. For the purpose of other heuristics, an 1909 // instruction that cannot issue appears as if it's not in the ReadyQueue. 1910 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; 1911 if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU)) 1912 Pending.push(SU); 1913 else 1914 Available.push(SU); 1915 1916 // Record this node as an immediate dependent of the scheduled node. 1917 NextSUs.insert(SU); 1918 } 1919 1920 /// Move the boundary of scheduled code by one cycle. 1921 void ConvergingScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) { 1922 if (SchedModel->getMicroOpBufferSize() == 0) { 1923 assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); 1924 if (MinReadyCycle > NextCycle) 1925 NextCycle = MinReadyCycle; 1926 } 1927 // Update the current micro-ops, which will issue in the next cycle. 1928 unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle); 1929 CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps; 1930 1931 // Decrement DependentLatency based on the next cycle. 1932 if ((NextCycle - CurrCycle) > DependentLatency) 1933 DependentLatency = 0; 1934 else 1935 DependentLatency -= (NextCycle - CurrCycle); 1936 1937 if (!HazardRec->isEnabled()) { 1938 // Bypass HazardRec virtual calls. 1939 CurrCycle = NextCycle; 1940 } 1941 else { 1942 // Bypass getHazardType calls in case of long latency. 1943 for (; CurrCycle != NextCycle; ++CurrCycle) { 1944 if (isTop()) 1945 HazardRec->AdvanceCycle(); 1946 else 1947 HazardRec->RecedeCycle(); 1948 } 1949 } 1950 CheckPending = true; 1951 unsigned LFactor = SchedModel->getLatencyFactor(); 1952 IsResourceLimited = 1953 (int)(getCriticalCount() - (getScheduledLatency() * LFactor)) 1954 > (int)LFactor; 1955 1956 DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n'); 1957 } 1958 1959 void ConvergingScheduler::SchedBoundary::incExecutedResources(unsigned PIdx, 1960 unsigned Count) { 1961 ExecutedResCounts[PIdx] += Count; 1962 if (ExecutedResCounts[PIdx] > MaxExecutedResCount) 1963 MaxExecutedResCount = ExecutedResCounts[PIdx]; 1964 } 1965 1966 /// Add the given processor resource to this scheduled zone. 1967 /// 1968 /// \param Cycles indicates the number of consecutive (non-pipelined) cycles 1969 /// during which this resource is consumed. 1970 /// 1971 /// \return the next cycle at which the instruction may execute without 1972 /// oversubscribing resources. 
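/// Rough sketch of the bookkeeping (factors hypothetical): a resource with
/// two units per cycle typically has half the latency factor, so consuming
/// it for 3 cycles adds 3 * factor to ExecutedResCounts[PIdx], a value that
/// is directly comparable with cycles scaled by getLatencyFactor().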
1973 unsigned ConvergingScheduler::SchedBoundary:: 1974 countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) { 1975 unsigned Factor = SchedModel->getResourceFactor(PIdx); 1976 unsigned Count = Factor * Cycles; 1977 DEBUG(dbgs() << " " << getResourceName(PIdx) 1978 << " +" << Cycles << "x" << Factor << "u\n"); 1979 1980 // Update Executed resources counts. 1981 incExecutedResources(PIdx, Count); 1982 assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted"); 1983 Rem->RemainingCounts[PIdx] -= Count; 1984 1985 // Check if this resource exceeds the current critical resource. If so, it 1986 // becomes the critical resource. 1987 if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) { 1988 ZoneCritResIdx = PIdx; 1989 DEBUG(dbgs() << " *** Critical resource " 1990 << getResourceName(PIdx) << ": " 1991 << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n"); 1992 } 1993 // TODO: We don't yet model reserved resources. It's not hard though. 1994 return CurrCycle; 1995 } 1996 1997 /// Move the boundary of scheduled code by one SUnit. 1998 void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) { 1999 // Update the reservation table. 2000 if (HazardRec->isEnabled()) { 2001 if (!isTop() && SU->isCall) { 2002 // Calls are scheduled with their preceding instructions. For bottom-up 2003 // scheduling, clear the pipeline state before emitting. 2004 HazardRec->Reset(); 2005 } 2006 HazardRec->EmitInstruction(SU); 2007 } 2008 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 2009 unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr()); 2010 CurrMOps += IncMOps; 2011 // checkHazard prevents scheduling multiple instructions per cycle that exceed 2012 // issue width. However, we commonly reach the maximum. In this case 2013 // opportunistically bump the cycle to avoid uselessly checking everything in 2014 // the readyQ. Furthermore, a single instruction may produce more than one 2015 // cycle's worth of micro-ops. 2016 // 2017 // TODO: Also check if this SU must end a dispatch group. 2018 unsigned NextCycle = CurrCycle; 2019 if (CurrMOps >= SchedModel->getIssueWidth()) { 2020 ++NextCycle; 2021 DEBUG(dbgs() << " *** Max MOps " << CurrMOps 2022 << " at cycle " << CurrCycle << '\n'); 2023 } 2024 unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle); 2025 DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n"); 2026 2027 switch (SchedModel->getMicroOpBufferSize()) { 2028 case 0: 2029 assert(ReadyCycle <= CurrCycle && "Broken PendingQueue"); 2030 break; 2031 case 1: 2032 if (ReadyCycle > NextCycle) { 2033 NextCycle = ReadyCycle; 2034 DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n"); 2035 } 2036 break; 2037 default: 2038 // We don't currently model the OOO reorder buffer, so consider all 2039 // scheduled MOps to be "retired". 2040 break; 2041 } 2042 RetiredMOps += IncMOps; 2043 2044 // Update resource counts and critical resource. 2045 if (SchedModel->hasInstrSchedModel()) { 2046 unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor(); 2047 assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted"); 2048 Rem->RemIssueCount -= DecRemIssue; 2049 if (ZoneCritResIdx) { 2050 // Scale scheduled micro-ops for comparing with the critical resource. 2051 unsigned ScaledMOps = 2052 RetiredMOps * SchedModel->getMicroOpFactor(); 2053 2054 // If scaled micro-ops are now more than the previous critical resource by 2055 // a full cycle, then micro-ops issue becomes critical. 
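// (Made-up numbers: with micro-op and latency factors of 2, 10 retired
// micro-ops scale to 20; a critical resource count of 17 then lags by more
// than one scaled cycle, so plain issue width takes over as the bottleneck.)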
2056 if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx)) 2057 >= (int)SchedModel->getLatencyFactor()) { 2058 ZoneCritResIdx = 0; 2059 DEBUG(dbgs() << " *** Critical resource NumMicroOps: " 2060 << ScaledMOps / SchedModel->getLatencyFactor() << "c\n"); 2061 } 2062 } 2063 for (TargetSchedModel::ProcResIter 2064 PI = SchedModel->getWriteProcResBegin(SC), 2065 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2066 unsigned RCycle = 2067 countResource(PI->ProcResourceIdx, PI->Cycles, ReadyCycle); 2068 if (RCycle > NextCycle) 2069 NextCycle = RCycle; 2070 } 2071 } 2072 // Update ExpectedLatency and DependentLatency. 2073 unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency; 2074 unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency; 2075 if (SU->getDepth() > TopLatency) { 2076 TopLatency = SU->getDepth(); 2077 DEBUG(dbgs() << " " << Available.getName() 2078 << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n"); 2079 } 2080 if (SU->getHeight() > BotLatency) { 2081 BotLatency = SU->getHeight(); 2082 DEBUG(dbgs() << " " << Available.getName() 2083 << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n"); 2084 } 2085 // If we stall for any reason, bump the cycle. 2086 if (NextCycle > CurrCycle) { 2087 bumpCycle(NextCycle); 2088 } 2089 else { 2090 // After updating ZoneCritResIdx and ExpectedLatency, check if we're 2091 // resource limited. If a stall occured, bumpCycle does this. 2092 unsigned LFactor = SchedModel->getLatencyFactor(); 2093 IsResourceLimited = 2094 (int)(getCriticalCount() - (getScheduledLatency() * LFactor)) 2095 > (int)LFactor; 2096 } 2097 DEBUG(dumpScheduledState()); 2098 } 2099 2100 /// Release pending ready nodes in to the available queue. This makes them 2101 /// visible to heuristics. 2102 void ConvergingScheduler::SchedBoundary::releasePending() { 2103 // If the available queue is empty, it is safe to reset MinReadyCycle. 2104 if (Available.empty()) 2105 MinReadyCycle = UINT_MAX; 2106 2107 // Check to see if any of the pending instructions are ready to issue. If 2108 // so, add them to the available queue. 2109 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; 2110 for (unsigned i = 0, e = Pending.size(); i != e; ++i) { 2111 SUnit *SU = *(Pending.begin()+i); 2112 unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; 2113 2114 if (ReadyCycle < MinReadyCycle) 2115 MinReadyCycle = ReadyCycle; 2116 2117 if (!IsBuffered && ReadyCycle > CurrCycle) 2118 continue; 2119 2120 if (checkHazard(SU)) 2121 continue; 2122 2123 Available.push(SU); 2124 Pending.remove(Pending.begin()+i); 2125 --i; --e; 2126 } 2127 DEBUG(if (!Pending.empty()) Pending.dump()); 2128 CheckPending = false; 2129 } 2130 2131 /// Remove SU from the ready set for this boundary. 2132 void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) { 2133 if (Available.isInQueue(SU)) 2134 Available.remove(Available.find(SU)); 2135 else { 2136 assert(Pending.isInQueue(SU) && "bad ready count"); 2137 Pending.remove(Pending.find(SU)); 2138 } 2139 } 2140 2141 /// If this queue only has one ready candidate, return it. As a side effect, 2142 /// defer any nodes that now hit a hazard, and advance the cycle until at least 2143 /// one node is ready. If multiple instructions are ready, return NULL. 2144 SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() { 2145 if (CheckPending) 2146 releasePending(); 2147 2148 if (CurrMOps > 0) { 2149 // Defer any ready instrs that now have a hazard. 
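// (E.g. an SUnit whose micro-ops no longer fit in the current issue group is
// moved back to Pending here and re-released once the cycle is bumped.)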
2150 for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) { 2151 if (checkHazard(*I)) { 2152 Pending.push(*I); 2153 I = Available.remove(I); 2154 continue; 2155 } 2156 ++I; 2157 } 2158 } 2159 for (unsigned i = 0; Available.empty(); ++i) { 2160 assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) && 2161 "permanent hazard"); (void)i; 2162 bumpCycle(CurrCycle + 1); 2163 releasePending(); 2164 } 2165 if (Available.size() == 1) 2166 return *Available.begin(); 2167 return NULL; 2168 } 2169 2170 #ifndef NDEBUG 2171 // This is useful information to dump after bumpNode. 2172 // Note that the Queue contents are more useful before pickNodeFromQueue. 2173 void ConvergingScheduler::SchedBoundary::dumpScheduledState() { 2174 unsigned ResFactor; 2175 unsigned ResCount; 2176 if (ZoneCritResIdx) { 2177 ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx); 2178 ResCount = getResourceCount(ZoneCritResIdx); 2179 } 2180 else { 2181 ResFactor = SchedModel->getMicroOpFactor(); 2182 ResCount = RetiredMOps * SchedModel->getMicroOpFactor(); 2183 } 2184 unsigned LFactor = SchedModel->getLatencyFactor(); 2185 dbgs() << Available.getName() << " @" << CurrCycle << "c\n" 2186 << " Retired: " << RetiredMOps; 2187 dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c"; 2188 dbgs() << "\n Critical: " << ResCount / LFactor << "c, " 2189 << ResCount / ResFactor << " " << getResourceName(ZoneCritResIdx) 2190 << "\n ExpectedLatency: " << ExpectedLatency << "c\n" 2191 << (IsResourceLimited ? " - Resource" : " - Latency") 2192 << " limited.\n"; 2193 } 2194 #endif 2195 2196 void ConvergingScheduler::SchedCandidate:: 2197 initResourceDelta(const ScheduleDAGMI *DAG, 2198 const TargetSchedModel *SchedModel) { 2199 if (!Policy.ReduceResIdx && !Policy.DemandResIdx) 2200 return; 2201 2202 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 2203 for (TargetSchedModel::ProcResIter 2204 PI = SchedModel->getWriteProcResBegin(SC), 2205 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2206 if (PI->ProcResourceIdx == Policy.ReduceResIdx) 2207 ResDelta.CritResources += PI->Cycles; 2208 if (PI->ProcResourceIdx == Policy.DemandResIdx) 2209 ResDelta.DemandedResources += PI->Cycles; 2210 } 2211 } 2212 2213 2214 /// Return true if this heuristic determines order. 
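/// For illustration (values hypothetical): with Reason == RegExcess,
/// tryLess(0, 1, ...) marks TryCand as the new winner; tryLess(2, 1, ...)
/// keeps Cand and records RegExcess as its reason if that outranks the
/// current one; equal values record a repeat and defer to later heuristics.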
2215 static bool tryLess(int TryVal, int CandVal, 2216 ConvergingScheduler::SchedCandidate &TryCand, 2217 ConvergingScheduler::SchedCandidate &Cand, 2218 ConvergingScheduler::CandReason Reason) { 2219 if (TryVal < CandVal) { 2220 TryCand.Reason = Reason; 2221 return true; 2222 } 2223 if (TryVal > CandVal) { 2224 if (Cand.Reason > Reason) 2225 Cand.Reason = Reason; 2226 return true; 2227 } 2228 Cand.setRepeat(Reason); 2229 return false; 2230 } 2231 2232 static bool tryGreater(int TryVal, int CandVal, 2233 ConvergingScheduler::SchedCandidate &TryCand, 2234 ConvergingScheduler::SchedCandidate &Cand, 2235 ConvergingScheduler::CandReason Reason) { 2236 if (TryVal > CandVal) { 2237 TryCand.Reason = Reason; 2238 return true; 2239 } 2240 if (TryVal < CandVal) { 2241 if (Cand.Reason > Reason) 2242 Cand.Reason = Reason; 2243 return true; 2244 } 2245 Cand.setRepeat(Reason); 2246 return false; 2247 } 2248 2249 static bool tryPressure(const PressureChange &TryP, 2250 const PressureChange &CandP, 2251 ConvergingScheduler::SchedCandidate &TryCand, 2252 ConvergingScheduler::SchedCandidate &Cand, 2253 ConvergingScheduler::CandReason Reason) { 2254 int TryRank = TryP.getPSetOrMax(); 2255 int CandRank = CandP.getPSetOrMax(); 2256 // If both candidates affect the same set, go with the smallest increase. 2257 if (TryRank == CandRank) { 2258 return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, 2259 Reason); 2260 } 2261 // If one candidate decreases and the other increases, go with it. 2262 // Invalid candidates have UnitInc==0. 2263 if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, 2264 Reason)) { 2265 return true; 2266 } 2267 // If the candidates are decreasing pressure, reverse priority. 2268 if (TryP.getUnitInc() < 0) 2269 std::swap(TryRank, CandRank); 2270 return tryGreater(TryRank, CandRank, TryCand, Cand, Reason); 2271 } 2272 2273 static unsigned getWeakLeft(const SUnit *SU, bool isTop) { 2274 return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft; 2275 } 2276 2277 /// Minimize physical register live ranges. Regalloc wants them adjacent to 2278 /// their physreg def/use. 2279 /// 2280 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf 2281 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled 2282 /// with the operation that produces or consumes the physreg. We'll do this when 2283 /// regalloc has support for parallel copies. 2284 static int biasPhysRegCopy(const SUnit *SU, bool isTop) { 2285 const MachineInstr *MI = SU->getInstr(); 2286 if (!MI->isCopy()) 2287 return 0; 2288 2289 unsigned ScheduledOper = isTop ? 1 : 0; 2290 unsigned UnscheduledOper = isTop ? 0 : 1; 2291 // If we have already scheduled the physreg produce/consumer, immediately 2292 // schedule the copy. 2293 if (TargetRegisterInfo::isPhysicalRegister( 2294 MI->getOperand(ScheduledOper).getReg())) 2295 return 1; 2296 // If the physreg is at the boundary, defer it. Otherwise schedule it 2297 // immediately to free the dependent. We can hoist the copy later. 2298 bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; 2299 if (TargetRegisterInfo::isPhysicalRegister( 2300 MI->getOperand(UnscheduledOper).getReg())) 2301 return AtBoundary ? 
-1 : 1; 2302 return 0; 2303 } 2304 2305 static bool tryLatency(ConvergingScheduler::SchedCandidate &TryCand, 2306 ConvergingScheduler::SchedCandidate &Cand, 2307 ConvergingScheduler::SchedBoundary &Zone) { 2308 if (Zone.isTop()) { 2309 if (Cand.SU->getDepth() > Zone.getScheduledLatency()) { 2310 if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), 2311 TryCand, Cand, ConvergingScheduler::TopDepthReduce)) 2312 return true; 2313 } 2314 if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(), 2315 TryCand, Cand, ConvergingScheduler::TopPathReduce)) 2316 return true; 2317 } 2318 else { 2319 if (Cand.SU->getHeight() > Zone.getScheduledLatency()) { 2320 if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), 2321 TryCand, Cand, ConvergingScheduler::BotHeightReduce)) 2322 return true; 2323 } 2324 if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(), 2325 TryCand, Cand, ConvergingScheduler::BotPathReduce)) 2326 return true; 2327 } 2328 return false; 2329 } 2330 2331 /// Apply a set of heursitics to a new candidate. Heuristics are currently 2332 /// hierarchical. This may be more efficient than a graduated cost model because 2333 /// we don't need to evaluate all aspects of the model for each node in the 2334 /// queue. But it's really done to make the heuristics easier to debug and 2335 /// statistically analyze. 2336 /// 2337 /// \param Cand provides the policy and current best candidate. 2338 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. 2339 /// \param Zone describes the scheduled zone that we are extending. 2340 /// \param RPTracker describes reg pressure within the scheduled zone. 2341 /// \param TempTracker is a scratch pressure tracker to reuse in queries. 2342 void ConvergingScheduler::tryCandidate(SchedCandidate &Cand, 2343 SchedCandidate &TryCand, 2344 SchedBoundary &Zone, 2345 const RegPressureTracker &RPTracker, 2346 RegPressureTracker &TempTracker) { 2347 2348 // Always initialize TryCand's RPDelta. 2349 if (Zone.isTop()) { 2350 TempTracker.getMaxDownwardPressureDelta( 2351 TryCand.SU->getInstr(), 2352 TryCand.RPDelta, 2353 DAG->getRegionCriticalPSets(), 2354 DAG->getRegPressure().MaxSetPressure); 2355 } 2356 else { 2357 if (VerifyScheduling) { 2358 TempTracker.getMaxUpwardPressureDelta( 2359 TryCand.SU->getInstr(), 2360 &DAG->getPressureDiff(TryCand.SU), 2361 TryCand.RPDelta, 2362 DAG->getRegionCriticalPSets(), 2363 DAG->getRegPressure().MaxSetPressure); 2364 } 2365 else { 2366 RPTracker.getUpwardPressureDelta( 2367 TryCand.SU->getInstr(), 2368 DAG->getPressureDiff(TryCand.SU), 2369 TryCand.RPDelta, 2370 DAG->getRegionCriticalPSets(), 2371 DAG->getRegPressure().MaxSetPressure); 2372 } 2373 } 2374 2375 // Initialize the candidate if needed. 2376 if (!Cand.isValid()) { 2377 TryCand.Reason = NodeOrder; 2378 return; 2379 } 2380 2381 if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()), 2382 biasPhysRegCopy(Cand.SU, Zone.isTop()), 2383 TryCand, Cand, PhysRegCopy)) 2384 return; 2385 2386 // Avoid exceeding the target's limit. If signed PSetID is negative, it is 2387 // invalid; convert it to INT_MAX to give it lowest priority. 2388 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, 2389 RegExcess)) 2390 return; 2391 2392 // For loops that are acyclic path limited, aggressively schedule for latency. 2393 if (Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone)) 2394 return; 2395 2396 // Avoid increasing the max critical pressure in the scheduled region. 
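// (Roughly: CriticalMax is the candidate's increase to pressure sets that
// are already over their limit somewhere in this region, so a hypothetical
// candidate that leaves them unchanged beats one that raises them.)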
2397 if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, 2398 TryCand, Cand, RegCritical)) 2399 return; 2400 2401 // Keep clustered nodes together to encourage downstream peephole 2402 // optimizations which may reduce resource requirements. 2403 // 2404 // This is a best effort to set things up for a post-RA pass. Optimizations 2405 // like generating loads of multiple registers should ideally be done within 2406 // the scheduler pass by combining the loads during DAG postprocessing. 2407 const SUnit *NextClusterSU = 2408 Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); 2409 if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU, 2410 TryCand, Cand, Cluster)) 2411 return; 2412 2413 // Weak edges are for clustering and other constraints. 2414 if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()), 2415 getWeakLeft(Cand.SU, Zone.isTop()), 2416 TryCand, Cand, Weak)) { 2417 return; 2418 } 2419 // Avoid increasing the max pressure of the entire region. 2420 if (tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, 2421 TryCand, Cand, RegMax)) 2422 return; 2423 2424 // Avoid critical resource consumption and balance the schedule. 2425 TryCand.initResourceDelta(DAG, SchedModel); 2426 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, 2427 TryCand, Cand, ResourceReduce)) 2428 return; 2429 if (tryGreater(TryCand.ResDelta.DemandedResources, 2430 Cand.ResDelta.DemandedResources, 2431 TryCand, Cand, ResourceDemand)) 2432 return; 2433 2434 // Avoid serializing long latency dependence chains. 2435 // For acyclic path limited loops, latency was already checked above. 2436 if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited 2437 && tryLatency(TryCand, Cand, Zone)) { 2438 return; 2439 } 2440 2441 // Prefer immediate defs/users of the last scheduled instruction. This is a 2442 // local pressure avoidance strategy that also makes the machine code 2443 // readable. 2444 if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU), 2445 TryCand, Cand, NextDefUse)) 2446 return; 2447 2448 // Fall through to original instruction order. 
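// (Top-down this favors the lower NodeNum, bottom-up the higher one, so a
// tie on every other heuristic preserves the original instruction order.)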
2449 if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) 2450 || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { 2451 TryCand.Reason = NodeOrder; 2452 } 2453 } 2454 2455 #ifndef NDEBUG 2456 const char *ConvergingScheduler::getReasonStr( 2457 ConvergingScheduler::CandReason Reason) { 2458 switch (Reason) { 2459 case NoCand: return "NOCAND "; 2460 case PhysRegCopy: return "PREG-COPY"; 2461 case RegExcess: return "REG-EXCESS"; 2462 case RegCritical: return "REG-CRIT "; 2463 case Cluster: return "CLUSTER "; 2464 case Weak: return "WEAK "; 2465 case RegMax: return "REG-MAX "; 2466 case ResourceReduce: return "RES-REDUCE"; 2467 case ResourceDemand: return "RES-DEMAND"; 2468 case TopDepthReduce: return "TOP-DEPTH "; 2469 case TopPathReduce: return "TOP-PATH "; 2470 case BotHeightReduce:return "BOT-HEIGHT"; 2471 case BotPathReduce: return "BOT-PATH "; 2472 case NextDefUse: return "DEF-USE "; 2473 case NodeOrder: return "ORDER "; 2474 }; 2475 llvm_unreachable("Unknown reason!"); 2476 } 2477 2478 void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) { 2479 PressureChange P; 2480 unsigned ResIdx = 0; 2481 unsigned Latency = 0; 2482 switch (Cand.Reason) { 2483 default: 2484 break; 2485 case RegExcess: 2486 P = Cand.RPDelta.Excess; 2487 break; 2488 case RegCritical: 2489 P = Cand.RPDelta.CriticalMax; 2490 break; 2491 case RegMax: 2492 P = Cand.RPDelta.CurrentMax; 2493 break; 2494 case ResourceReduce: 2495 ResIdx = Cand.Policy.ReduceResIdx; 2496 break; 2497 case ResourceDemand: 2498 ResIdx = Cand.Policy.DemandResIdx; 2499 break; 2500 case TopDepthReduce: 2501 Latency = Cand.SU->getDepth(); 2502 break; 2503 case TopPathReduce: 2504 Latency = Cand.SU->getHeight(); 2505 break; 2506 case BotHeightReduce: 2507 Latency = Cand.SU->getHeight(); 2508 break; 2509 case BotPathReduce: 2510 Latency = Cand.SU->getDepth(); 2511 break; 2512 } 2513 dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); 2514 if (P.isValid()) 2515 dbgs() << " " << TRI->getRegPressureSetName(P.getPSet()) 2516 << ":" << P.getUnitInc() << " "; 2517 else 2518 dbgs() << " "; 2519 if (ResIdx) 2520 dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " "; 2521 else 2522 dbgs() << " "; 2523 if (Latency) 2524 dbgs() << " " << Latency << " cycles "; 2525 else 2526 dbgs() << " "; 2527 dbgs() << '\n'; 2528 } 2529 #endif 2530 2531 /// Pick the best candidate from the top queue. 2532 /// 2533 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during 2534 /// DAG building. To adjust for the current scheduling location we need to 2535 /// maintain the number of vreg uses remaining to be top-scheduled. 2536 void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone, 2537 const RegPressureTracker &RPTracker, 2538 SchedCandidate &Cand) { 2539 ReadyQueue &Q = Zone.Available; 2540 2541 DEBUG(Q.dump()); 2542 2543 // getMaxPressureDelta temporarily modifies the tracker. 2544 RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); 2545 2546 for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { 2547 2548 SchedCandidate TryCand(Cand.Policy); 2549 TryCand.SU = *I; 2550 tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker); 2551 if (TryCand.Reason != NoCand) { 2552 // Initialize resource delta if needed in case future heuristics query it. 
2553 if (TryCand.ResDelta == SchedResourceDelta()) 2554 TryCand.initResourceDelta(DAG, SchedModel); 2555 Cand.setBest(TryCand); 2556 DEBUG(traceCandidate(Cand)); 2557 } 2558 } 2559 } 2560 2561 static void tracePick(const ConvergingScheduler::SchedCandidate &Cand, 2562 bool IsTop) { 2563 DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") 2564 << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n'); 2565 } 2566 2567 /// Pick the best candidate node from either the top or bottom queue. 2568 SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) { 2569 // Schedule as far as possible in the direction of no choice. This is most 2570 // efficient, but also provides the best heuristics for CriticalPSets. 2571 if (SUnit *SU = Bot.pickOnlyChoice()) { 2572 IsTopNode = false; 2573 DEBUG(dbgs() << "Pick Bot NOCAND\n"); 2574 return SU; 2575 } 2576 if (SUnit *SU = Top.pickOnlyChoice()) { 2577 IsTopNode = true; 2578 DEBUG(dbgs() << "Pick Top NOCAND\n"); 2579 return SU; 2580 } 2581 CandPolicy NoPolicy; 2582 SchedCandidate BotCand(NoPolicy); 2583 SchedCandidate TopCand(NoPolicy); 2584 Bot.setPolicy(BotCand.Policy, Top); 2585 Top.setPolicy(TopCand.Policy, Bot); 2586 2587 // Prefer bottom scheduling when heuristics are silent. 2588 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); 2589 assert(BotCand.Reason != NoCand && "failed to find the first candidate"); 2590 2591 // If either Q has a single candidate that provides the least increase in 2592 // Excess pressure, we can immediately schedule from that Q. 2593 // 2594 // RegionCriticalPSets summarizes the pressure within the scheduled region and 2595 // affects picking from either Q. If scheduling in one direction must 2596 // increase pressure for one of the excess PSets, then schedule in that 2597 // direction first to provide more freedom in the other direction. 2598 if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess)) 2599 || (BotCand.Reason == RegCritical 2600 && !BotCand.isRepeat(RegCritical))) 2601 { 2602 IsTopNode = false; 2603 tracePick(BotCand, IsTopNode); 2604 return BotCand.SU; 2605 } 2606 // Check if the top Q has a better candidate. 2607 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); 2608 assert(TopCand.Reason != NoCand && "failed to find the first candidate"); 2609 2610 // Choose the queue with the most important (lowest enum) reason. 2611 if (TopCand.Reason < BotCand.Reason) { 2612 IsTopNode = true; 2613 tracePick(TopCand, IsTopNode); 2614 return TopCand.SU; 2615 } 2616 // Otherwise prefer the bottom candidate, in node order if all else failed. 2617 IsTopNode = false; 2618 tracePick(BotCand, IsTopNode); 2619 return BotCand.SU; 2620 } 2621 2622 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy. 
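/// With -misched-topdown or -misched-bottomup the choice is forced to one
/// zone; otherwise pickNodeBidirectional() compares the best candidate from
/// each boundary and takes the one whose reason ranks higher (lower enum
/// value), preferring the bottom zone on a tie.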
2623 SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { 2624 if (DAG->top() == DAG->bottom()) { 2625 assert(Top.Available.empty() && Top.Pending.empty() && 2626 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); 2627 return NULL; 2628 } 2629 SUnit *SU; 2630 do { 2631 if (ForceTopDown) { 2632 SU = Top.pickOnlyChoice(); 2633 if (!SU) { 2634 CandPolicy NoPolicy; 2635 SchedCandidate TopCand(NoPolicy); 2636 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); 2637 assert(TopCand.Reason != NoCand && "failed to find the first candidate"); 2638 SU = TopCand.SU; 2639 } 2640 IsTopNode = true; 2641 } 2642 else if (ForceBottomUp) { 2643 SU = Bot.pickOnlyChoice(); 2644 if (!SU) { 2645 CandPolicy NoPolicy; 2646 SchedCandidate BotCand(NoPolicy); 2647 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); 2648 assert(BotCand.Reason != NoCand && "failed to find the first candidate"); 2649 SU = BotCand.SU; 2650 } 2651 IsTopNode = false; 2652 } 2653 else { 2654 SU = pickNodeBidirectional(IsTopNode); 2655 } 2656 } while (SU->isScheduled); 2657 2658 if (SU->isTopReady()) 2659 Top.removeReady(SU); 2660 if (SU->isBottomReady()) 2661 Bot.removeReady(SU); 2662 2663 DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); 2664 return SU; 2665 } 2666 2667 void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { 2668 2669 MachineBasicBlock::iterator InsertPos = SU->getInstr(); 2670 if (!isTop) 2671 ++InsertPos; 2672 SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs; 2673 2674 // Find already scheduled copies with a single physreg dependence and move 2675 // them just above the scheduled instruction. 2676 for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end(); 2677 I != E; ++I) { 2678 if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg())) 2679 continue; 2680 SUnit *DepSU = I->getSUnit(); 2681 if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) 2682 continue; 2683 MachineInstr *Copy = DepSU->getInstr(); 2684 if (!Copy->isCopy()) 2685 continue; 2686 DEBUG(dbgs() << " Rescheduling physreg copy "; 2687 I->getSUnit()->dump(DAG)); 2688 DAG->moveInstruction(Copy, InsertPos); 2689 } 2690 } 2691 2692 /// Update the scheduler's state after scheduling a node. This is the same node 2693 /// that was just returned by pickNode(). However, ScheduleDAGMI needs to update 2694 /// it's state based on the current cycle before MachineSchedStrategy does. 2695 /// 2696 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling 2697 /// them here. See comments in biasPhysRegCopy. 2698 void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) { 2699 if (IsTopNode) { 2700 SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.CurrCycle); 2701 Top.bumpNode(SU); 2702 if (SU->hasPhysRegUses) 2703 reschedulePhysRegCopies(SU, true); 2704 } 2705 else { 2706 SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.CurrCycle); 2707 Bot.bumpNode(SU); 2708 if (SU->hasPhysRegDefs) 2709 reschedulePhysRegCopies(SU, false); 2710 } 2711 } 2712 2713 /// Create the standard converging machine scheduler. This will be used as the 2714 /// default scheduler if the target does not set a default. 2715 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) { 2716 assert((!ForceTopDown || !ForceBottomUp) && 2717 "-misched-topdown incompatible with -misched-bottomup"); 2718 ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler()); 2719 // Register DAG post-processors. 
2720 // 2721 // FIXME: extend the mutation API to allow earlier mutations to instantiate 2722 // data and pass it to later mutations. Have a single mutation that gathers 2723 // the interesting nodes in one pass. 2724 DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI)); 2725 if (EnableLoadCluster) 2726 DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI)); 2727 if (EnableMacroFusion) 2728 DAG->addMutation(new MacroFusion(DAG->TII)); 2729 return DAG; 2730 } 2731 static MachineSchedRegistry 2732 ConvergingSchedRegistry("converge", "Standard converging scheduler.", 2733 createConvergingSched); 2734 2735 //===----------------------------------------------------------------------===// 2736 // ILP Scheduler. Currently for experimental analysis of heuristics. 2737 //===----------------------------------------------------------------------===// 2738 2739 namespace { 2740 /// \brief Order nodes by the ILP metric. 2741 struct ILPOrder { 2742 const SchedDFSResult *DFSResult; 2743 const BitVector *ScheduledTrees; 2744 bool MaximizeILP; 2745 2746 ILPOrder(bool MaxILP): DFSResult(0), ScheduledTrees(0), MaximizeILP(MaxILP) {} 2747 2748 /// \brief Apply a less-than relation on node priority. 2749 /// 2750 /// (Return true if A comes after B in the Q.) 2751 bool operator()(const SUnit *A, const SUnit *B) const { 2752 unsigned SchedTreeA = DFSResult->getSubtreeID(A); 2753 unsigned SchedTreeB = DFSResult->getSubtreeID(B); 2754 if (SchedTreeA != SchedTreeB) { 2755 // Unscheduled trees have lower priority. 2756 if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB)) 2757 return ScheduledTrees->test(SchedTreeB); 2758 2759 // Trees with shallower connections have have lower priority. 2760 if (DFSResult->getSubtreeLevel(SchedTreeA) 2761 != DFSResult->getSubtreeLevel(SchedTreeB)) { 2762 return DFSResult->getSubtreeLevel(SchedTreeA) 2763 < DFSResult->getSubtreeLevel(SchedTreeB); 2764 } 2765 } 2766 if (MaximizeILP) 2767 return DFSResult->getILP(A) < DFSResult->getILP(B); 2768 else 2769 return DFSResult->getILP(A) > DFSResult->getILP(B); 2770 } 2771 }; 2772 2773 /// \brief Schedule based on the ILP metric. 2774 class ILPScheduler : public MachineSchedStrategy { 2775 /// In case all subtrees are eventually connected to a common root through 2776 /// data dependence (e.g. reduction), place an upper limit on their size. 2777 /// 2778 /// FIXME: A subtree limit is generally good, but in the situation commented 2779 /// above, where multiple similar subtrees feed a common root, we should 2780 /// only split at a point where the resulting subtrees will be balanced. 2781 /// (a motivating test case must be found). 2782 static const unsigned SubtreeLimit = 16; 2783 2784 ScheduleDAGMI *DAG; 2785 ILPOrder Cmp; 2786 2787 std::vector<SUnit*> ReadyQ; 2788 public: 2789 ILPScheduler(bool MaximizeILP): DAG(0), Cmp(MaximizeILP) {} 2790 2791 virtual void initialize(ScheduleDAGMI *dag) { 2792 DAG = dag; 2793 DAG->computeDFSResult(); 2794 Cmp.DFSResult = DAG->getDFSResult(); 2795 Cmp.ScheduledTrees = &DAG->getScheduledTrees(); 2796 ReadyQ.clear(); 2797 } 2798 2799 virtual void registerRoots() { 2800 // Restore the heap in ReadyQ with the updated DFS results. 2801 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2802 } 2803 2804 /// Implement MachineSchedStrategy interface. 2805 /// ----------------------------------------- 2806 2807 /// Callback to select the highest priority node from the ready Q. 
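/// ReadyQ is kept as a heap ordered by ILPOrder, so after pop_heap the back
/// element is the preferred node; e.g. when maximizing ILP, a node whose DFS
/// ILP value is 5 is picked before one whose value is 3 (numbers
/// hypothetical).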
2808 virtual SUnit *pickNode(bool &IsTopNode) { 2809 if (ReadyQ.empty()) return NULL; 2810 std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2811 SUnit *SU = ReadyQ.back(); 2812 ReadyQ.pop_back(); 2813 IsTopNode = false; 2814 DEBUG(dbgs() << "Pick node " << "SU(" << SU->NodeNum << ") " 2815 << " ILP: " << DAG->getDFSResult()->getILP(SU) 2816 << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @" 2817 << DAG->getDFSResult()->getSubtreeLevel( 2818 DAG->getDFSResult()->getSubtreeID(SU)) << '\n' 2819 << "Scheduling " << *SU->getInstr()); 2820 return SU; 2821 } 2822 2823 /// \brief Scheduler callback to notify that a new subtree is scheduled. 2824 virtual void scheduleTree(unsigned SubtreeID) { 2825 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2826 } 2827 2828 /// Callback after a node is scheduled. Mark a newly scheduled tree, notify 2829 /// DFSResults, and resort the priority Q. 2830 virtual void schedNode(SUnit *SU, bool IsTopNode) { 2831 assert(!IsTopNode && "SchedDFSResult needs bottom-up"); 2832 } 2833 2834 virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ } 2835 2836 virtual void releaseBottomNode(SUnit *SU) { 2837 ReadyQ.push_back(SU); 2838 std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2839 } 2840 }; 2841 } // namespace 2842 2843 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { 2844 return new ScheduleDAGMI(C, new ILPScheduler(true)); 2845 } 2846 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { 2847 return new ScheduleDAGMI(C, new ILPScheduler(false)); 2848 } 2849 static MachineSchedRegistry ILPMaxRegistry( 2850 "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler); 2851 static MachineSchedRegistry ILPMinRegistry( 2852 "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler); 2853 2854 //===----------------------------------------------------------------------===// 2855 // Machine Instruction Shuffler for Correctness Testing 2856 //===----------------------------------------------------------------------===// 2857 2858 #ifndef NDEBUG 2859 namespace { 2860 /// Apply a less-than relation on the node order, which corresponds to the 2861 /// instruction order prior to scheduling. IsReverse implements greater-than. 2862 template<bool IsReverse> 2863 struct SUnitOrder { 2864 bool operator()(SUnit *A, SUnit *B) const { 2865 if (IsReverse) 2866 return A->NodeNum > B->NodeNum; 2867 else 2868 return A->NodeNum < B->NodeNum; 2869 } 2870 }; 2871 2872 /// Reorder instructions as much as possible. 2873 class InstructionShuffler : public MachineSchedStrategy { 2874 bool IsAlternating; 2875 bool IsTopDown; 2876 2877 // Using a less-than relation (SUnitOrder<false>) for the TopQ priority 2878 // gives nodes with a higher number higher priority causing the latest 2879 // instructions to be scheduled first. 2880 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> > 2881 TopQ; 2882 // When scheduling bottom-up, use greater-than as the queue priority. 2883 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> > 2884 BottomQ; 2885 public: 2886 InstructionShuffler(bool alternate, bool topdown) 2887 : IsAlternating(alternate), IsTopDown(topdown) {} 2888 2889 virtual void initialize(ScheduleDAGMI *) { 2890 TopQ.clear(); 2891 BottomQ.clear(); 2892 } 2893 2894 /// Implement MachineSchedStrategy interface. 
2895 /// ----------------------------------------- 2896 2897 virtual SUnit *pickNode(bool &IsTopNode) { 2898 SUnit *SU; 2899 if (IsTopDown) { 2900 do { 2901 if (TopQ.empty()) return NULL; 2902 SU = TopQ.top(); 2903 TopQ.pop(); 2904 } while (SU->isScheduled); 2905 IsTopNode = true; 2906 } 2907 else { 2908 do { 2909 if (BottomQ.empty()) return NULL; 2910 SU = BottomQ.top(); 2911 BottomQ.pop(); 2912 } while (SU->isScheduled); 2913 IsTopNode = false; 2914 } 2915 if (IsAlternating) 2916 IsTopDown = !IsTopDown; 2917 return SU; 2918 } 2919 2920 virtual void schedNode(SUnit *SU, bool IsTopNode) {} 2921 2922 virtual void releaseTopNode(SUnit *SU) { 2923 TopQ.push(SU); 2924 } 2925 virtual void releaseBottomNode(SUnit *SU) { 2926 BottomQ.push(SU); 2927 } 2928 }; 2929 } // namespace 2930 2931 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { 2932 bool Alternate = !ForceTopDown && !ForceBottomUp; 2933 bool TopDown = !ForceBottomUp; 2934 assert((TopDown || !ForceTopDown) && 2935 "-misched-topdown incompatible with -misched-bottomup"); 2936 return new ScheduleDAGMI(C, new InstructionShuffler(Alternate, TopDown)); 2937 } 2938 static MachineSchedRegistry ShufflerRegistry( 2939 "shuffle", "Shuffle machine instructions alternating directions", 2940 createInstructionShuffler); 2941 #endif // !NDEBUG 2942 2943 //===----------------------------------------------------------------------===// 2944 // GraphWriter support for ScheduleDAGMI. 2945 //===----------------------------------------------------------------------===// 2946 2947 #ifndef NDEBUG 2948 namespace llvm { 2949 2950 template<> struct GraphTraits< 2951 ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {}; 2952 2953 template<> 2954 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { 2955 2956 DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} 2957 2958 static std::string getGraphName(const ScheduleDAG *G) { 2959 return G->MF.getName(); 2960 } 2961 2962 static bool renderGraphFromBottomUp() { 2963 return true; 2964 } 2965 2966 static bool isNodeHidden(const SUnit *Node) { 2967 return (Node->NumPreds > 10 || Node->NumSuccs > 10); 2968 } 2969 2970 static bool hasNodeAddressLabel(const SUnit *Node, 2971 const ScheduleDAG *Graph) { 2972 return false; 2973 } 2974 2975 /// If you want to override the dot attributes printed for a particular 2976 /// edge, override this method. 
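/// (E.g. returning "color=red" here would render the matching edges red;
/// the defaults below distinguish artificial and control dependences.)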
2977 static std::string getEdgeAttributes(const SUnit *Node, 2978 SUnitIterator EI, 2979 const ScheduleDAG *Graph) { 2980 if (EI.isArtificialDep()) 2981 return "color=cyan,style=dashed"; 2982 if (EI.isCtrlDep()) 2983 return "color=blue,style=dashed"; 2984 return ""; 2985 } 2986 2987 static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) { 2988 std::string Str; 2989 raw_string_ostream SS(Str); 2990 SS << "SU(" << SU->NodeNum << ')'; 2991 return SS.str(); 2992 } 2993 static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) { 2994 return G->getGraphNodeLabel(SU); 2995 } 2996 2997 static std::string getNodeAttributes(const SUnit *N, 2998 const ScheduleDAG *Graph) { 2999 std::string Str("shape=Mrecord"); 3000 const SchedDFSResult *DFS = 3001 static_cast<const ScheduleDAGMI*>(Graph)->getDFSResult(); 3002 if (DFS) { 3003 Str += ",style=filled,fillcolor=\"#"; 3004 Str += DOT::getColorString(DFS->getSubtreeID(N)); 3005 Str += '"'; 3006 } 3007 return Str; 3008 } 3009 }; 3010 } // namespace llvm 3011 #endif // NDEBUG 3012 3013 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG 3014 /// rendered using 'dot'. 3015 /// 3016 void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) { 3017 #ifndef NDEBUG 3018 ViewGraph(this, Name, false, Title); 3019 #else 3020 errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on " 3021 << "systems with Graphviz or gv!\n"; 3022 #endif // NDEBUG 3023 } 3024 3025 /// Out-of-line implementation with no arguments is handy for gdb. 3026 void ScheduleDAGMI::viewGraph() { 3027 viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName()); 3028 } 3029
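// The registries above ("converge", "ilpmax", "ilpmin", "shuffle") all follow
// the same pattern, and the sketch below shows the minimal surface a new
// strategy needs in order to be selectable with -misched=mysched. It is
// purely illustrative: MyStrategy, createMyScheduler, and the "mysched" name
// are hypothetical, and a real strategy would add heuristics in pickNode.
namespace {
/// Illustrative bottom-up LIFO strategy.
class MyStrategy : public MachineSchedStrategy {
  std::vector<SUnit*> ReadyQ;
public:
  virtual void initialize(ScheduleDAGMI *dag) { ReadyQ.clear(); }

  /// Pick the most recently released node, always from the bottom zone.
  virtual SUnit *pickNode(bool &IsTopNode) {
    if (ReadyQ.empty()) return NULL;
    SUnit *SU = ReadyQ.back();
    ReadyQ.pop_back();
    IsTopNode = false;
    return SU;
  }

  virtual void schedNode(SUnit *SU, bool IsTopNode) {}

  virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ }

  virtual void releaseBottomNode(SUnit *SU) { ReadyQ.push_back(SU); }
};
} // namespace

static ScheduleDAGInstrs *createMyScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMI(C, new MyStrategy());
}
static MachineSchedRegistry
MySchedRegistry("mysched", "Illustrative LIFO scheduler.", createMyScheduler);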