1 //===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // MachineScheduler schedules machine instructions after phi elimination. It 11 // preserves LiveIntervals so it can be invoked before register allocation. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #define DEBUG_TYPE "misched" 16 17 #include "llvm/CodeGen/MachineScheduler.h" 18 #include "llvm/ADT/OwningPtr.h" 19 #include "llvm/ADT/PriorityQueue.h" 20 #include "llvm/Analysis/AliasAnalysis.h" 21 #include "llvm/CodeGen/LiveIntervalAnalysis.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/MachineLoopInfo.h" 24 #include "llvm/CodeGen/MachineRegisterInfo.h" 25 #include "llvm/CodeGen/Passes.h" 26 #include "llvm/CodeGen/RegisterClassInfo.h" 27 #include "llvm/CodeGen/ScheduleDFS.h" 28 #include "llvm/CodeGen/ScheduleHazardRecognizer.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/Debug.h" 31 #include "llvm/Support/ErrorHandling.h" 32 #include "llvm/Support/GraphWriter.h" 33 #include "llvm/Support/raw_ostream.h" 34 #include "llvm/Target/TargetInstrInfo.h" 35 #include <queue> 36 37 using namespace llvm; 38 39 namespace llvm { 40 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden, 41 cl::desc("Force top-down list scheduling")); 42 cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, 43 cl::desc("Force bottom-up list scheduling")); 44 } 45 46 #ifndef NDEBUG 47 static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, 48 cl::desc("Pop up a window to show MISched dags after they are processed")); 49 50 static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden, 51 cl::desc("Stop scheduling after N instructions"), cl::init(~0U)); 52 #else 53 static bool ViewMISchedDAGs = false; 54 #endif // NDEBUG 55 56 static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden, 57 cl::desc("Enable register pressure scheduling."), cl::init(true)); 58 59 static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden, 60 cl::desc("Enable cyclic critical path analysis."), cl::init(true)); 61 62 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden, 63 cl::desc("Enable load clustering."), cl::init(true)); 64 65 // Experimental heuristics 66 static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, 67 cl::desc("Enable scheduling for macro fusion."), cl::init(true)); 68 69 static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden, 70 cl::desc("Verify machine instrs before and after machine scheduling")); 71 72 // DAG subtrees must have at least this many nodes. 73 static const unsigned MinSubtreeSize = 8; 74 75 //===----------------------------------------------------------------------===// 76 // Machine Instruction Scheduling Pass and Registry 77 //===----------------------------------------------------------------------===// 78 79 MachineSchedContext::MachineSchedContext(): 80 MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) { 81 RegClassInfo = new RegisterClassInfo(); 82 } 83 84 MachineSchedContext::~MachineSchedContext() { 85 delete RegClassInfo; 86 } 87 88 namespace { 89 /// MachineScheduler runs after coalescing and before register allocation. 
90 class MachineScheduler : public MachineSchedContext, 91 public MachineFunctionPass { 92 public: 93 MachineScheduler(); 94 95 virtual void getAnalysisUsage(AnalysisUsage &AU) const; 96 97 virtual void releaseMemory() {} 98 99 virtual bool runOnMachineFunction(MachineFunction&); 100 101 virtual void print(raw_ostream &O, const Module* = 0) const; 102 103 static char ID; // Class identification, replacement for typeinfo 104 105 protected: 106 ScheduleDAGInstrs *createMachineScheduler(); 107 }; 108 } // namespace 109 110 char MachineScheduler::ID = 0; 111 112 char &llvm::MachineSchedulerID = MachineScheduler::ID; 113 114 INITIALIZE_PASS_BEGIN(MachineScheduler, "misched", 115 "Machine Instruction Scheduler", false, false) 116 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 117 INITIALIZE_PASS_DEPENDENCY(SlotIndexes) 118 INITIALIZE_PASS_DEPENDENCY(LiveIntervals) 119 INITIALIZE_PASS_END(MachineScheduler, "misched", 120 "Machine Instruction Scheduler", false, false) 121 122 MachineScheduler::MachineScheduler() 123 : MachineFunctionPass(ID) { 124 initializeMachineSchedulerPass(*PassRegistry::getPassRegistry()); 125 } 126 127 void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { 128 AU.setPreservesCFG(); 129 AU.addRequiredID(MachineDominatorsID); 130 AU.addRequired<MachineLoopInfo>(); 131 AU.addRequired<AliasAnalysis>(); 132 AU.addRequired<TargetPassConfig>(); 133 AU.addRequired<SlotIndexes>(); 134 AU.addPreserved<SlotIndexes>(); 135 AU.addRequired<LiveIntervals>(); 136 AU.addPreserved<LiveIntervals>(); 137 MachineFunctionPass::getAnalysisUsage(AU); 138 } 139 140 MachinePassRegistry MachineSchedRegistry::Registry; 141 142 /// A dummy default scheduler factory indicates whether the scheduler 143 /// is overridden on the command line. 144 static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { 145 return 0; 146 } 147 148 /// MachineSchedOpt allows command line selection of the scheduler. 149 static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, 150 RegisterPassParser<MachineSchedRegistry> > 151 MachineSchedOpt("misched", 152 cl::init(&useDefaultMachineSched), cl::Hidden, 153 cl::desc("Machine instruction scheduler to use")); 154 155 static MachineSchedRegistry 156 DefaultSchedRegistry("default", "Use the target's default scheduler choice.", 157 useDefaultMachineSched); 158 159 /// Forward declare the standard machine scheduler. This will be used as the 160 /// default scheduler if the target does not set a default. 161 static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C); 162 163 164 /// Decrement this iterator until reaching the top or a non-debug instr. 165 static MachineBasicBlock::const_iterator 166 priorNonDebug(MachineBasicBlock::const_iterator I, 167 MachineBasicBlock::const_iterator Beg) { 168 assert(I != Beg && "reached the top of the region, cannot decrement"); 169 while (--I != Beg) { 170 if (!I->isDebugValue()) 171 break; 172 } 173 return I; 174 } 175 176 /// Non-const version. 177 static MachineBasicBlock::iterator 178 priorNonDebug(MachineBasicBlock::iterator I, 179 MachineBasicBlock::const_iterator Beg) { 180 return const_cast<MachineInstr*>( 181 &*priorNonDebug(MachineBasicBlock::const_iterator(I), Beg)); 182 } 183 184 /// If this iterator is a debug value, increment until reaching the End or a 185 /// non-debug instruction. 
static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  // Cast the return value to nonconst MachineInstr, then cast to an
  // instr_iterator, which does not check for null, finally return a
  // bundle_iterator.
  return MachineBasicBlock::instr_iterator(
    const_cast<MachineInstr*>(
      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
}

/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
  // Select the scheduler, or set the default.
  MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
  if (Ctor != useDefaultMachineSched)
    return Ctor(this);

  // Get the default scheduler set by the target for this function.
  ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
  if (Scheduler)
    return Scheduler;

  // Default to GenericScheduler.
  return createGenericSched(this);
}

/// Top-level MachineScheduler pass driver.
///
/// Visit blocks in function order. Divide each block into scheduling regions
/// and visit them bottom-up. Visiting regions bottom-up is not required, but is
/// consistent with the DAG builder, which traverses the interior of the
/// scheduling regions bottom-up.
///
/// This design avoids exposing scheduling boundaries to the DAG builder,
/// simplifying the DAG builder's support for "special" target instructions.
/// At the same time the design allows target schedulers to operate across
/// scheduling boundaries, for example to bundle the boundary instructions
/// without reordering them. This creates complexity, because the target
/// scheduler must update the RegionBegin and RegionEnd positions cached by
/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
/// design would be to split blocks at scheduling boundaries, but LLVM has a
/// general bias against block splitting purely for implementation simplicity.
bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
  DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));

  // Initialize the context of the pass.
  MF = &mf;
  MLI = &getAnalysis<MachineLoopInfo>();
  MDT = &getAnalysis<MachineDominatorTree>();
  PassConfig = &getAnalysis<TargetPassConfig>();
  AA = &getAnalysis<AliasAnalysis>();

  LIS = &getAnalysis<LiveIntervals>();
  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();

  if (VerifyScheduling) {
    DEBUG(LIS->dump());
    MF->verify(this, "Before machine scheduling.");
  }
  RegClassInfo->runOnMachineFunction(*MF);

  // Instantiate the selected scheduler for this target, function, and
  // optimization level.
  OwningPtr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());

  // Visit all machine basic blocks.
  //
  // TODO: Visit blocks in global postorder or postorder within the bottom-up
  // loop tree. Then we can optionally compute global RegPressure.
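  // Illustrative sketch of the region splitting done below (hypothetical
  // instructions; which instructions count as boundaries is decided by
  // TII->isSchedulingBoundary() and is target dependent):
  //
  //   %a = ...
  //   %b = use %a
  //   LABEL               ; assume the target reports this as a boundary
  //   %c = ...
  //   JMP %exit           ; terminator, also a boundary
  //
  // The region loop first visits the trivial region holding only %c (skipped
  // as a single-instruction region), then schedules the region holding %a and
  // %b. Boundary instructions themselves are never reordered.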
  for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
       MBB != MBBEnd; ++MBB) {

    Scheduler->startBlock(MBB);

    // Break the block into scheduling regions [I, RegionEnd), and schedule each
    // region as soon as it is discovered. RegionEnd points to the scheduling
    // boundary at the bottom of the region. The DAG does not include RegionEnd,
    // but the region does (i.e. the next RegionEnd is above the previous
    // RegionBegin). If the current block has no terminator then RegionEnd ==
    // MBB->end() for the bottom region.
    //
    // The Scheduler may insert instructions during either schedule() or
    // exitRegion(), even for empty regions. So the local iterators 'I' and
    // 'RegionEnd' are invalid across these calls.
    unsigned RemainingInstrs = MBB->size();
    for (MachineBasicBlock::iterator RegionEnd = MBB->end();
         RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {

      // Avoid decrementing RegionEnd for blocks with no terminator.
      if (RegionEnd != MBB->end()
          || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
        --RegionEnd;
        // Count the boundary instruction.
        --RemainingInstrs;
      }

      // The next region starts above the previous region. Look backward in the
      // instruction stream until we find the nearest boundary.
      unsigned NumRegionInstrs = 0;
      MachineBasicBlock::iterator I = RegionEnd;
      for (; I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
        if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
          break;
      }
      // Notify the scheduler of the region, even if we may skip scheduling
      // it. Perhaps it still needs to be bundled.
      Scheduler->enterRegion(MBB, I, RegionEnd, NumRegionInstrs);

      // Skip empty scheduling regions (0 or 1 schedulable instructions).
      if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
        // Close the current region. Bundle the terminator if needed.
        // This invalidates 'RegionEnd' and 'I'.
        Scheduler->exitRegion();
        continue;
      }
      DEBUG(dbgs() << "********** MI Scheduling **********\n");
      DEBUG(dbgs() << MF->getName()
            << ":BB#" << MBB->getNumber() << " " << MBB->getName()
            << "\n From: " << *I << " To: ";
            if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
            else dbgs() << "End";
            dbgs() << " RegionInstrs: " << NumRegionInstrs
            << " Remaining: " << RemainingInstrs << "\n");

      // Schedule a region: possibly reorder instructions.
      // This invalidates 'RegionEnd' and 'I'.
      Scheduler->schedule();

      // Close the current region.
      Scheduler->exitRegion();

      // Scheduling has invalidated the current iterator 'I'. Ask the
      // scheduler for the top of its scheduled region.
331 RegionEnd = Scheduler->begin(); 332 } 333 assert(RemainingInstrs == 0 && "Instruction count mismatch!"); 334 Scheduler->finishBlock(); 335 } 336 Scheduler->finalizeSchedule(); 337 DEBUG(LIS->dump()); 338 if (VerifyScheduling) 339 MF->verify(this, "After machine scheduling."); 340 return true; 341 } 342 343 void MachineScheduler::print(raw_ostream &O, const Module* m) const { 344 // unimplemented 345 } 346 347 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 348 void ReadyQueue::dump() { 349 dbgs() << Name << ": "; 350 for (unsigned i = 0, e = Queue.size(); i < e; ++i) 351 dbgs() << Queue[i]->NodeNum << " "; 352 dbgs() << "\n"; 353 } 354 #endif 355 356 //===----------------------------------------------------------------------===// 357 // ScheduleDAGMI - Base class for MachineInstr scheduling with LiveIntervals 358 // preservation. 359 //===----------------------------------------------------------------------===// 360 361 ScheduleDAGMI::~ScheduleDAGMI() { 362 delete DFSResult; 363 DeleteContainerPointers(Mutations); 364 delete SchedImpl; 365 } 366 367 bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) { 368 return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU); 369 } 370 371 bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) { 372 if (SuccSU != &ExitSU) { 373 // Do not use WillCreateCycle, it assumes SD scheduling. 374 // If Pred is reachable from Succ, then the edge creates a cycle. 375 if (Topo.IsReachable(PredDep.getSUnit(), SuccSU)) 376 return false; 377 Topo.AddPred(SuccSU, PredDep.getSUnit()); 378 } 379 SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial()); 380 // Return true regardless of whether a new edge needed to be inserted. 381 return true; 382 } 383 384 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When 385 /// NumPredsLeft reaches zero, release the successor node. 386 /// 387 /// FIXME: Adjust SuccSU height based on MinLatency. 388 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { 389 SUnit *SuccSU = SuccEdge->getSUnit(); 390 391 if (SuccEdge->isWeak()) { 392 --SuccSU->WeakPredsLeft; 393 if (SuccEdge->isCluster()) 394 NextClusterSucc = SuccSU; 395 return; 396 } 397 #ifndef NDEBUG 398 if (SuccSU->NumPredsLeft == 0) { 399 dbgs() << "*** Scheduling failed! ***\n"; 400 SuccSU->dump(this); 401 dbgs() << " has been released too many times!\n"; 402 llvm_unreachable(0); 403 } 404 #endif 405 --SuccSU->NumPredsLeft; 406 if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) 407 SchedImpl->releaseTopNode(SuccSU); 408 } 409 410 /// releaseSuccessors - Call releaseSucc on each of SU's successors. 411 void ScheduleDAGMI::releaseSuccessors(SUnit *SU) { 412 for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); 413 I != E; ++I) { 414 releaseSucc(SU, &*I); 415 } 416 } 417 418 /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When 419 /// NumSuccsLeft reaches zero, release the predecessor node. 420 /// 421 /// FIXME: Adjust PredSU height based on MinLatency. 422 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { 423 SUnit *PredSU = PredEdge->getSUnit(); 424 425 if (PredEdge->isWeak()) { 426 --PredSU->WeakSuccsLeft; 427 if (PredEdge->isCluster()) 428 NextClusterPred = PredSU; 429 return; 430 } 431 #ifndef NDEBUG 432 if (PredSU->NumSuccsLeft == 0) { 433 dbgs() << "*** Scheduling failed! 
***\n"; 434 PredSU->dump(this); 435 dbgs() << " has been released too many times!\n"; 436 llvm_unreachable(0); 437 } 438 #endif 439 --PredSU->NumSuccsLeft; 440 if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) 441 SchedImpl->releaseBottomNode(PredSU); 442 } 443 444 /// releasePredecessors - Call releasePred on each of SU's predecessors. 445 void ScheduleDAGMI::releasePredecessors(SUnit *SU) { 446 for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); 447 I != E; ++I) { 448 releasePred(SU, &*I); 449 } 450 } 451 452 /// This is normally called from the main scheduler loop but may also be invoked 453 /// by the scheduling strategy to perform additional code motion. 454 void ScheduleDAGMI::moveInstruction(MachineInstr *MI, 455 MachineBasicBlock::iterator InsertPos) { 456 // Advance RegionBegin if the first instruction moves down. 457 if (&*RegionBegin == MI) 458 ++RegionBegin; 459 460 // Update the instruction stream. 461 BB->splice(InsertPos, BB, MI); 462 463 // Update LiveIntervals 464 LIS->handleMove(MI, /*UpdateFlags=*/true); 465 466 // Recede RegionBegin if an instruction moves above the first. 467 if (RegionBegin == InsertPos) 468 RegionBegin = MI; 469 } 470 471 bool ScheduleDAGMI::checkSchedLimit() { 472 #ifndef NDEBUG 473 if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) { 474 CurrentTop = CurrentBottom; 475 return false; 476 } 477 ++NumInstrsScheduled; 478 #endif 479 return true; 480 } 481 482 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after 483 /// crossing a scheduling boundary. [begin, end) includes all instructions in 484 /// the region, including the boundary itself and single-instruction regions 485 /// that don't get scheduled. 486 void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb, 487 MachineBasicBlock::iterator begin, 488 MachineBasicBlock::iterator end, 489 unsigned regioninstrs) 490 { 491 ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs); 492 493 // For convenience remember the end of the liveness region. 494 LiveRegionEnd = 495 (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd); 496 497 SUPressureDiffs.clear(); 498 499 SchedImpl->initPolicy(begin, end, regioninstrs); 500 501 ShouldTrackPressure = SchedImpl->shouldTrackPressure(); 502 } 503 504 // Setup the register pressure trackers for the top scheduled top and bottom 505 // scheduled regions. 506 void ScheduleDAGMI::initRegPressure() { 507 TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); 508 BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd); 509 510 // Close the RPTracker to finalize live ins. 511 RPTracker.closeRegion(); 512 513 DEBUG(RPTracker.dump()); 514 515 // Initialize the live ins and live outs. 516 TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); 517 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); 518 519 // Close one end of the tracker so we can call 520 // getMaxUpward/DownwardPressureDelta before advancing across any 521 // instructions. This converts currently live regs into live ins/outs. 522 TopRPTracker.closeTop(); 523 BotRPTracker.closeBottom(); 524 525 BotRPTracker.initLiveThru(RPTracker); 526 if (!BotRPTracker.getLiveThru().empty()) { 527 TopRPTracker.initLiveThru(BotRPTracker.getLiveThru()); 528 DEBUG(dbgs() << "Live Thru: "; 529 dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI)); 530 }; 531 532 // For each live out vreg reduce the pressure change associated with other 533 // uses of the same vreg below the live-out reaching def. 
534 updatePressureDiffs(RPTracker.getPressure().LiveOutRegs); 535 536 // Account for liveness generated by the region boundary. 537 if (LiveRegionEnd != RegionEnd) { 538 SmallVector<unsigned, 8> LiveUses; 539 BotRPTracker.recede(&LiveUses); 540 updatePressureDiffs(LiveUses); 541 } 542 543 assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom"); 544 545 // Cache the list of excess pressure sets in this region. This will also track 546 // the max pressure in the scheduled code for these sets. 547 RegionCriticalPSets.clear(); 548 const std::vector<unsigned> &RegionPressure = 549 RPTracker.getPressure().MaxSetPressure; 550 for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) { 551 unsigned Limit = RegClassInfo->getRegPressureSetLimit(i); 552 if (RegionPressure[i] > Limit) { 553 DEBUG(dbgs() << TRI->getRegPressureSetName(i) 554 << " Limit " << Limit 555 << " Actual " << RegionPressure[i] << "\n"); 556 RegionCriticalPSets.push_back(PressureChange(i)); 557 } 558 } 559 DEBUG(dbgs() << "Excess PSets: "; 560 for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i) 561 dbgs() << TRI->getRegPressureSetName( 562 RegionCriticalPSets[i].getPSet()) << " "; 563 dbgs() << "\n"); 564 } 565 566 void ScheduleDAGMI:: 567 updateScheduledPressure(const SUnit *SU, 568 const std::vector<unsigned> &NewMaxPressure) { 569 const PressureDiff &PDiff = getPressureDiff(SU); 570 unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size(); 571 for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end(); 572 I != E; ++I) { 573 if (!I->isValid()) 574 break; 575 unsigned ID = I->getPSet(); 576 while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID) 577 ++CritIdx; 578 if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) { 579 if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc() 580 && NewMaxPressure[ID] <= INT16_MAX) 581 RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]); 582 } 583 unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID); 584 if (NewMaxPressure[ID] >= Limit - 2) { 585 DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": " 586 << NewMaxPressure[ID] << " > " << Limit << "(+ " 587 << BotRPTracker.getLiveThru()[ID] << " livethru)\n"); 588 } 589 } 590 } 591 592 /// Update the PressureDiff array for liveness after scheduling this 593 /// instruction. 594 void ScheduleDAGMI::updatePressureDiffs(ArrayRef<unsigned> LiveUses) { 595 for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) { 596 /// FIXME: Currently assuming single-use physregs. 597 unsigned Reg = LiveUses[LUIdx]; 598 DEBUG(dbgs() << " LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n"); 599 if (!TRI->isVirtualRegister(Reg)) 600 continue; 601 602 // This may be called before CurrentBottom has been initialized. However, 603 // BotRPTracker must have a valid position. We want the value live into the 604 // instruction or live out of the block, so ask for the previous 605 // instruction's live-out. 606 const LiveInterval &LI = LIS->getInterval(Reg); 607 VNInfo *VNI; 608 MachineBasicBlock::const_iterator I = 609 nextIfDebug(BotRPTracker.getPos(), BB->end()); 610 if (I == BB->end()) 611 VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB)); 612 else { 613 LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(I)); 614 VNI = LRQ.valueIn(); 615 } 616 // RegisterPressureTracker guarantees that readsReg is true for LiveUses. 
    assert(VNI && "No live value at use.");
    for (VReg2UseMap::iterator
           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
      SUnit *SU = UI->SU;
      DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
            << *SU->getInstr());
      // If this use comes before the reaching def, it cannot be a last use, so
      // decrease its pressure change.
      if (!SU->isScheduled && SU != &ExitSU) {
        LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(SU->getInstr()));
        if (LRQ.valueIn() == VNI)
          getPressureDiff(SU).addPressureChange(Reg, true, &MRI);
      }
    }
  }
}

/// schedule - Called back from MachineScheduler::runOnMachineFunction
/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
/// only includes instructions that have DAG nodes, not scheduling boundaries.
///
/// This is a skeletal driver, with all the functionality pushed into helpers,
/// so that it can be easily extended by experimental schedulers. Generally,
/// implementing MachineSchedStrategy should be sufficient to implement a new
/// scheduling algorithm. However, if a scheduler further subclasses
/// ScheduleDAGMI then it will want to override this virtual method in order to
/// update any specialized state.
void ScheduleDAGMI::schedule() {
  buildDAGWithRegPressure();

  Topo.InitDAGTopologicalSorting();

  postprocessDAG();

  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  findRootsAndBiasEdges(TopRoots, BotRoots);

  // Initialize the strategy before modifying the DAG.
  // This may initialize a DFSResult to be used for queue priority.
  SchedImpl->initialize(this);

  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
          SUnits[su].dumpAll(this));
  if (ViewMISchedDAGs) viewGraph();

  // Initialize ready queues now that the DAG and priority data are finalized.
  initQueues(TopRoots, BotRoots);

  bool IsTopNode = false;
  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
    assert(!SU->isScheduled && "Node already scheduled");
    if (!checkSchedLimit())
      break;

    scheduleMI(SU, IsTopNode);

    updateQueues(SU, IsTopNode);
  }
  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
      unsigned BBNum = begin()->getParent()->getNumber();
      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
      dumpSchedule();
      dbgs() << '\n';
    });
}

/// Build the DAG and setup three register pressure trackers.
void ScheduleDAGMI::buildDAGWithRegPressure() {
  if (!ShouldTrackPressure) {
    RPTracker.reset();
    RegionCriticalPSets.clear();
    buildSchedGraph(AA);
    return;
  }

  // Initialize the register pressure tracker used by buildSchedGraph.
  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
                 /*TrackUntiedDefs=*/true);

  // Account for liveness generated by the region boundary.
  if (LiveRegionEnd != RegionEnd)
    RPTracker.recede();

  // Build the DAG, and compute current register pressure.
  buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);

  // Initialize top/bottom trackers after computing region pressure.
  initRegPressure();
}

/// Apply each ScheduleDAGMutation step in order.
void ScheduleDAGMI::postprocessDAG() {
  for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
    Mutations[i]->apply(this);
  }
}

void ScheduleDAGMI::computeDFSResult() {
  if (!DFSResult)
    DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
  DFSResult->clear();
  ScheduledTrees.clear();
  DFSResult->resize(SUnits.size());
  DFSResult->compute(SUnits);
  ScheduledTrees.resize(DFSResult->getNumSubtrees());
}

void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
                                          SmallVectorImpl<SUnit*> &BotRoots) {
  for (std::vector<SUnit>::iterator
         I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
    SUnit *SU = &(*I);
    assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits");

    // Order predecessors so DFSResult follows the critical path.
    SU->biasCriticalPath();

    // A SUnit is ready to top schedule if it has no predecessors.
    if (!I->NumPredsLeft)
      TopRoots.push_back(SU);
    // A SUnit is ready to bottom schedule if it has no successors.
    if (!I->NumSuccsLeft)
      BotRoots.push_back(SU);
  }
  ExitSU.biasCriticalPath();
}

/// Compute the max cyclic critical path through the DAG. The scheduling DAG
/// only provides the critical path for single block loops. To handle loops that
/// span blocks, we could use the vreg path latencies provided by
/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
/// available for use in the scheduler.
///
/// The cyclic path estimation identifies a def-use pair that crosses the back
/// edge and considers the depth and height of the nodes. For example, consider
/// the following instruction sequence where each instruction has unit latency
/// and defines an eponymous virtual register:
///
/// a->b(a,c)->c(b)->d(c)->exit
///
/// The cyclic critical path is two cycles: b->c->b
/// The acyclic critical path is four cycles: a->b->c->d->exit
/// LiveOutHeight = height(c) = len(c->d->exit) = 2
/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
/// LiveInDepth = depth(b) = len(a->b) = 1
///
/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
/// CyclicCriticalPath = min(2, 2) = 2
unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
  // This only applies to single block loops.
  if (!BB->isSuccessor(BB))
    return 0;

  unsigned MaxCyclicLatency = 0;
  // Visit each live out vreg def to find def/use pairs that cross iterations.
  ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
  for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
       RI != RE; ++RI) {
    unsigned Reg = *RI;
    if (!TRI->isVirtualRegister(Reg))
      continue;
    const LiveInterval &LI = LIS->getInterval(Reg);
    const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
    if (!DefVNI)
      continue;

    MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
    const SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;

    unsigned LiveOutHeight = DefSU->getHeight();
    unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
    // Visit all local users of the vreg def.
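    // (In the worked example in the comment above, DefSU is the def of 'c' and
    // the PHI-fed use is in 'b'; the loop below yields
    // min(depth slack, height slack) = 2 for that pair.)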
797 for (VReg2UseMap::iterator 798 UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) { 799 if (UI->SU == &ExitSU) 800 continue; 801 802 // Only consider uses of the phi. 803 LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr())); 804 if (!LRQ.valueIn()->isPHIDef()) 805 continue; 806 807 // Assume that a path spanning two iterations is a cycle, which could 808 // overestimate in strange cases. This allows cyclic latency to be 809 // estimated as the minimum slack of the vreg's depth or height. 810 unsigned CyclicLatency = 0; 811 if (LiveOutDepth > UI->SU->getDepth()) 812 CyclicLatency = LiveOutDepth - UI->SU->getDepth(); 813 814 unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency; 815 if (LiveInHeight > LiveOutHeight) { 816 if (LiveInHeight - LiveOutHeight < CyclicLatency) 817 CyclicLatency = LiveInHeight - LiveOutHeight; 818 } 819 else 820 CyclicLatency = 0; 821 822 DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU(" 823 << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n"); 824 if (CyclicLatency > MaxCyclicLatency) 825 MaxCyclicLatency = CyclicLatency; 826 } 827 } 828 DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n"); 829 return MaxCyclicLatency; 830 } 831 832 /// Identify DAG roots and setup scheduler queues. 833 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots, 834 ArrayRef<SUnit*> BotRoots) { 835 NextClusterSucc = NULL; 836 NextClusterPred = NULL; 837 838 // Release all DAG roots for scheduling, not including EntrySU/ExitSU. 839 // 840 // Nodes with unreleased weak edges can still be roots. 841 // Release top roots in forward order. 842 for (SmallVectorImpl<SUnit*>::const_iterator 843 I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) { 844 SchedImpl->releaseTopNode(*I); 845 } 846 // Release bottom roots in reverse order so the higher priority nodes appear 847 // first. This is more natural and slightly more efficient. 848 for (SmallVectorImpl<SUnit*>::const_reverse_iterator 849 I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) { 850 SchedImpl->releaseBottomNode(*I); 851 } 852 853 releaseSuccessors(&EntrySU); 854 releasePredecessors(&ExitSU); 855 856 SchedImpl->registerRoots(); 857 858 // Advance past initial DebugValues. 859 CurrentTop = nextIfDebug(RegionBegin, RegionEnd); 860 CurrentBottom = RegionEnd; 861 862 if (ShouldTrackPressure) { 863 assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); 864 TopRPTracker.setPos(CurrentTop); 865 } 866 } 867 868 /// Move an instruction and update register pressure. 869 void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) { 870 // Move the instruction to its new location in the instruction stream. 871 MachineInstr *MI = SU->getInstr(); 872 873 if (IsTopNode) { 874 assert(SU->isTopReady() && "node still has unscheduled dependencies"); 875 if (&*CurrentTop == MI) 876 CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom); 877 else { 878 moveInstruction(MI, CurrentTop); 879 TopRPTracker.setPos(MI); 880 } 881 882 if (ShouldTrackPressure) { 883 // Update top scheduled pressure. 
884 TopRPTracker.advance(); 885 assert(TopRPTracker.getPos() == CurrentTop && "out of sync"); 886 updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure); 887 } 888 } 889 else { 890 assert(SU->isBottomReady() && "node still has unscheduled dependencies"); 891 MachineBasicBlock::iterator priorII = 892 priorNonDebug(CurrentBottom, CurrentTop); 893 if (&*priorII == MI) 894 CurrentBottom = priorII; 895 else { 896 if (&*CurrentTop == MI) { 897 CurrentTop = nextIfDebug(++CurrentTop, priorII); 898 TopRPTracker.setPos(CurrentTop); 899 } 900 moveInstruction(MI, CurrentBottom); 901 CurrentBottom = MI; 902 } 903 if (ShouldTrackPressure) { 904 // Update bottom scheduled pressure. 905 SmallVector<unsigned, 8> LiveUses; 906 BotRPTracker.recede(&LiveUses); 907 assert(BotRPTracker.getPos() == CurrentBottom && "out of sync"); 908 updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure); 909 updatePressureDiffs(LiveUses); 910 } 911 } 912 } 913 914 /// Update scheduler queues after scheduling an instruction. 915 void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) { 916 // Release dependent instructions for scheduling. 917 if (IsTopNode) 918 releaseSuccessors(SU); 919 else 920 releasePredecessors(SU); 921 922 SU->isScheduled = true; 923 924 if (DFSResult) { 925 unsigned SubtreeID = DFSResult->getSubtreeID(SU); 926 if (!ScheduledTrees.test(SubtreeID)) { 927 ScheduledTrees.set(SubtreeID); 928 DFSResult->scheduleTree(SubtreeID); 929 SchedImpl->scheduleTree(SubtreeID); 930 } 931 } 932 933 // Notify the scheduling strategy after updating the DAG. 934 SchedImpl->schedNode(SU, IsTopNode); 935 } 936 937 /// Reinsert any remaining debug_values, just like the PostRA scheduler. 938 void ScheduleDAGMI::placeDebugValues() { 939 // If first instruction was a DBG_VALUE then put it back. 940 if (FirstDbgValue) { 941 BB->splice(RegionBegin, BB, FirstDbgValue); 942 RegionBegin = FirstDbgValue; 943 } 944 945 for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator 946 DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { 947 std::pair<MachineInstr *, MachineInstr *> P = *prior(DI); 948 MachineInstr *DbgValue = P.first; 949 MachineBasicBlock::iterator OrigPrevMI = P.second; 950 if (&*RegionBegin == DbgValue) 951 ++RegionBegin; 952 BB->splice(++OrigPrevMI, BB, DbgValue); 953 if (OrigPrevMI == llvm::prior(RegionEnd)) 954 RegionEnd = DbgValue; 955 } 956 DbgValues.clear(); 957 FirstDbgValue = NULL; 958 } 959 960 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 961 void ScheduleDAGMI::dumpSchedule() const { 962 for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) { 963 if (SUnit *SU = getSUnit(&(*MI))) 964 SU->dump(this); 965 else 966 dbgs() << "Missing SUnit\n"; 967 } 968 } 969 #endif 970 971 //===----------------------------------------------------------------------===// 972 // LoadClusterMutation - DAG post-processing to cluster loads. 973 //===----------------------------------------------------------------------===// 974 975 namespace { 976 /// \brief Post-process the DAG to create cluster edges between neighboring 977 /// loads. 
978 class LoadClusterMutation : public ScheduleDAGMutation { 979 struct LoadInfo { 980 SUnit *SU; 981 unsigned BaseReg; 982 unsigned Offset; 983 LoadInfo(SUnit *su, unsigned reg, unsigned ofs) 984 : SU(su), BaseReg(reg), Offset(ofs) {} 985 }; 986 static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS, 987 const LoadClusterMutation::LoadInfo &RHS); 988 989 const TargetInstrInfo *TII; 990 const TargetRegisterInfo *TRI; 991 public: 992 LoadClusterMutation(const TargetInstrInfo *tii, 993 const TargetRegisterInfo *tri) 994 : TII(tii), TRI(tri) {} 995 996 virtual void apply(ScheduleDAGMI *DAG); 997 protected: 998 void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG); 999 }; 1000 } // anonymous 1001 1002 bool LoadClusterMutation::LoadInfoLess( 1003 const LoadClusterMutation::LoadInfo &LHS, 1004 const LoadClusterMutation::LoadInfo &RHS) { 1005 if (LHS.BaseReg != RHS.BaseReg) 1006 return LHS.BaseReg < RHS.BaseReg; 1007 return LHS.Offset < RHS.Offset; 1008 } 1009 1010 void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads, 1011 ScheduleDAGMI *DAG) { 1012 SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords; 1013 for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) { 1014 SUnit *SU = Loads[Idx]; 1015 unsigned BaseReg; 1016 unsigned Offset; 1017 if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI)) 1018 LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset)); 1019 } 1020 if (LoadRecords.size() < 2) 1021 return; 1022 std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess); 1023 unsigned ClusterLength = 1; 1024 for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) { 1025 if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) { 1026 ClusterLength = 1; 1027 continue; 1028 } 1029 1030 SUnit *SUa = LoadRecords[Idx].SU; 1031 SUnit *SUb = LoadRecords[Idx+1].SU; 1032 if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength) 1033 && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { 1034 1035 DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU(" 1036 << SUb->NodeNum << ")\n"); 1037 // Copy successor edges from SUa to SUb. Interleaving computation 1038 // dependent on SUa can prevent load combining due to register reuse. 1039 // Predecessor edges do not need to be copied from SUb to SUa since nearby 1040 // loads should have effectively the same inputs. 1041 for (SUnit::const_succ_iterator 1042 SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) { 1043 if (SI->getSUnit() == SUb) 1044 continue; 1045 DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n"); 1046 DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial)); 1047 } 1048 ++ClusterLength; 1049 } 1050 else 1051 ClusterLength = 1; 1052 } 1053 } 1054 1055 /// \brief Callback from DAG postProcessing to create cluster edges for loads. 1056 void LoadClusterMutation::apply(ScheduleDAGMI *DAG) { 1057 // Map DAG NodeNum to store chain ID. 1058 DenseMap<unsigned, unsigned> StoreChainIDs; 1059 // Map each store chain to a set of dependent loads. 
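  // For illustration (hypothetical instructions): given
  //   SU(0): %a = LOAD [%base]
  //   SU(1): %b = LOAD [%base + 4]
  //   SU(2):      STORE ...         ; may alias
  //   SU(3): %c = LOAD [%base + 8]  ; chain-dependent on SU(2)
  // SU(0) and SU(1) share a chain predecessor and land in the same bucket, so
  // clusterNeighboringLoads may cluster them; SU(3) lands in a different bucket
  // and is never considered together with them.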
1060 SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents; 1061 for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { 1062 SUnit *SU = &DAG->SUnits[Idx]; 1063 if (!SU->getInstr()->mayLoad()) 1064 continue; 1065 unsigned ChainPredID = DAG->SUnits.size(); 1066 for (SUnit::const_pred_iterator 1067 PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) { 1068 if (PI->isCtrl()) { 1069 ChainPredID = PI->getSUnit()->NodeNum; 1070 break; 1071 } 1072 } 1073 // Check if this chain-like pred has been seen 1074 // before. ChainPredID==MaxNodeID for loads at the top of the schedule. 1075 unsigned NumChains = StoreChainDependents.size(); 1076 std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result = 1077 StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); 1078 if (Result.second) 1079 StoreChainDependents.resize(NumChains + 1); 1080 StoreChainDependents[Result.first->second].push_back(SU); 1081 } 1082 // Iterate over the store chains. 1083 for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx) 1084 clusterNeighboringLoads(StoreChainDependents[Idx], DAG); 1085 } 1086 1087 //===----------------------------------------------------------------------===// 1088 // MacroFusion - DAG post-processing to encourage fusion of macro ops. 1089 //===----------------------------------------------------------------------===// 1090 1091 namespace { 1092 /// \brief Post-process the DAG to create cluster edges between instructions 1093 /// that may be fused by the processor into a single operation. 1094 class MacroFusion : public ScheduleDAGMutation { 1095 const TargetInstrInfo *TII; 1096 public: 1097 MacroFusion(const TargetInstrInfo *tii): TII(tii) {} 1098 1099 virtual void apply(ScheduleDAGMI *DAG); 1100 }; 1101 } // anonymous 1102 1103 /// \brief Callback from DAG postProcessing to create cluster edges to encourage 1104 /// fused operations. 1105 void MacroFusion::apply(ScheduleDAGMI *DAG) { 1106 // For now, assume targets can only fuse with the branch. 1107 MachineInstr *Branch = DAG->ExitSU.getInstr(); 1108 if (!Branch) 1109 return; 1110 1111 for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) { 1112 SUnit *SU = &DAG->SUnits[--Idx]; 1113 if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch)) 1114 continue; 1115 1116 // Create a single weak edge from SU to ExitSU. The only effect is to cause 1117 // bottom-up scheduling to heavily prioritize the clustered SU. There is no 1118 // need to copy predecessor edges from ExitSU to SU, since top-down 1119 // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling 1120 // of SU, we could create an artificial edge from the deepest root, but it 1121 // hasn't been needed yet. 1122 bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster)); 1123 (void)Success; 1124 assert(Success && "No DAG nodes should be reachable from ExitSU"); 1125 1126 DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n"); 1127 break; 1128 } 1129 } 1130 1131 //===----------------------------------------------------------------------===// 1132 // CopyConstrain - DAG post-processing to encourage copy elimination. 1133 //===----------------------------------------------------------------------===// 1134 1135 namespace { 1136 /// \brief Post-process the DAG to create weak edges from all uses of a copy to 1137 /// the one use that defines the copy's source vreg, most likely an induction 1138 /// variable increment. 1139 class CopyConstrain : public ScheduleDAGMutation { 1140 // Transient state. 
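  // Both indexes below are recomputed for each scheduling region in apply()
  // before constrainLocalCopy() is called.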
  SlotIndex RegionBeginIdx;
  // RegionEndIdx is the slot index of the last non-debug instruction in the
  // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
  SlotIndex RegionEndIdx;
public:
  CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}

  virtual void apply(ScheduleDAGMI *DAG);

protected:
  void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG);
};
} // anonymous

/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
/// I0:     = dst
/// I1: src = ...
/// I2:     = dst
/// I3: dst = src (copy)
/// (create pred->succ edges I0->I1, I2->I1)
///
/// 2) Local copy:
/// I0: dst = src (copy)
/// I1:     = dst
/// I2: src = ...
/// I3:     = dst
/// (create pred->succ edges I1->I2, I3->I2)
///
/// Although the MachineScheduler is currently constrained to single blocks,
/// this algorithm should handle extended blocks. An EBB is a set of
/// contiguously numbered blocks such that the previous block in the EBB is
/// always the single predecessor.
void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) {
  LiveIntervals *LIS = DAG->getLIS();
  MachineInstr *Copy = CopySU->getInstr();

  // Check for pure vreg copies.
  unsigned SrcReg = Copy->getOperand(1).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
    return;

  unsigned DstReg = Copy->getOperand(0).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
    return;

  // Check if either the dest or source is local. If it's live across a back
  // edge, it's not local. Note that if both vregs are live across the back
  // edge, we cannot successfully constrain the copy without cyclic scheduling.
  unsigned LocalReg = DstReg;
  unsigned GlobalReg = SrcReg;
  LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
  if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
    LocalReg = SrcReg;
    GlobalReg = DstReg;
    LocalLI = &LIS->getInterval(LocalReg);
    if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
      return;
  }
  LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);

  // Find the global segment after the start of the local LI.
  LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
  // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
  // local live range. We could create edges from other global uses to the local
  // start, but the coalescer should have already eliminated these cases, so
  // don't bother dealing with it.
  if (GlobalSegment == GlobalLI->end())
    return;

  // If GlobalSegment is killed at the LocalLI->start, the call to find()
  // returned the next global segment. But if GlobalSegment overlaps with
  // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
  // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
  if (GlobalSegment->contains(LocalLI->beginIndex()))
    ++GlobalSegment;

  if (GlobalSegment == GlobalLI->end())
    return;

  // Check if GlobalLI contains a hole in the vicinity of LocalLI.
  if (GlobalSegment != GlobalLI->begin()) {
    // Two address defs have no hole.
1224 if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end, 1225 GlobalSegment->start)) { 1226 return; 1227 } 1228 // If the prior global segment may be defined by the same two-address 1229 // instruction that also defines LocalLI, then can't make a hole here. 1230 if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->start, 1231 LocalLI->beginIndex())) { 1232 return; 1233 } 1234 // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise 1235 // it would be a disconnected component in the live range. 1236 assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() && 1237 "Disconnected LRG within the scheduling region."); 1238 } 1239 MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start); 1240 if (!GlobalDef) 1241 return; 1242 1243 SUnit *GlobalSU = DAG->getSUnit(GlobalDef); 1244 if (!GlobalSU) 1245 return; 1246 1247 // GlobalDef is the bottom of the GlobalLI hole. Open the hole by 1248 // constraining the uses of the last local def to precede GlobalDef. 1249 SmallVector<SUnit*,8> LocalUses; 1250 const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex()); 1251 MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def); 1252 SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef); 1253 for (SUnit::const_succ_iterator 1254 I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end(); 1255 I != E; ++I) { 1256 if (I->getKind() != SDep::Data || I->getReg() != LocalReg) 1257 continue; 1258 if (I->getSUnit() == GlobalSU) 1259 continue; 1260 if (!DAG->canAddEdge(GlobalSU, I->getSUnit())) 1261 return; 1262 LocalUses.push_back(I->getSUnit()); 1263 } 1264 // Open the top of the GlobalLI hole by constraining any earlier global uses 1265 // to precede the start of LocalLI. 1266 SmallVector<SUnit*,8> GlobalUses; 1267 MachineInstr *FirstLocalDef = 1268 LIS->getInstructionFromIndex(LocalLI->beginIndex()); 1269 SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef); 1270 for (SUnit::const_pred_iterator 1271 I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) { 1272 if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg) 1273 continue; 1274 if (I->getSUnit() == FirstLocalSU) 1275 continue; 1276 if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit())) 1277 return; 1278 GlobalUses.push_back(I->getSUnit()); 1279 } 1280 DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n"); 1281 // Add the weak edges. 1282 for (SmallVectorImpl<SUnit*>::const_iterator 1283 I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) { 1284 DEBUG(dbgs() << " Local use SU(" << (*I)->NodeNum << ") -> SU(" 1285 << GlobalSU->NodeNum << ")\n"); 1286 DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak)); 1287 } 1288 for (SmallVectorImpl<SUnit*>::const_iterator 1289 I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) { 1290 DEBUG(dbgs() << " Global use SU(" << (*I)->NodeNum << ") -> SU(" 1291 << FirstLocalSU->NodeNum << ")\n"); 1292 DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak)); 1293 } 1294 } 1295 1296 /// \brief Callback from DAG postProcessing to create weak edges to encourage 1297 /// copy elimination. 
1298 void CopyConstrain::apply(ScheduleDAGMI *DAG) { 1299 MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end()); 1300 if (FirstPos == DAG->end()) 1301 return; 1302 RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos); 1303 RegionEndIdx = DAG->getLIS()->getInstructionIndex( 1304 &*priorNonDebug(DAG->end(), DAG->begin())); 1305 1306 for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { 1307 SUnit *SU = &DAG->SUnits[Idx]; 1308 if (!SU->getInstr()->isCopy()) 1309 continue; 1310 1311 constrainLocalCopy(SU, DAG); 1312 } 1313 } 1314 1315 //===----------------------------------------------------------------------===// 1316 // GenericScheduler - Implementation of the generic MachineSchedStrategy. 1317 //===----------------------------------------------------------------------===// 1318 1319 namespace { 1320 /// GenericScheduler shrinks the unscheduled zone using heuristics to balance 1321 /// the schedule. 1322 class GenericScheduler : public MachineSchedStrategy { 1323 public: 1324 /// Represent the type of SchedCandidate found within a single queue. 1325 /// pickNodeBidirectional depends on these listed by decreasing priority. 1326 enum CandReason { 1327 NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax, 1328 ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, 1329 TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; 1330 1331 #ifndef NDEBUG 1332 static const char *getReasonStr(GenericScheduler::CandReason Reason); 1333 #endif 1334 1335 /// Policy for scheduling the next instruction in the candidate's zone. 1336 struct CandPolicy { 1337 bool ReduceLatency; 1338 unsigned ReduceResIdx; 1339 unsigned DemandResIdx; 1340 1341 CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {} 1342 }; 1343 1344 /// Status of an instruction's critical resource consumption. 1345 struct SchedResourceDelta { 1346 // Count critical resources in the scheduled region required by SU. 1347 unsigned CritResources; 1348 1349 // Count critical resources from another region consumed by SU. 1350 unsigned DemandedResources; 1351 1352 SchedResourceDelta(): CritResources(0), DemandedResources(0) {} 1353 1354 bool operator==(const SchedResourceDelta &RHS) const { 1355 return CritResources == RHS.CritResources 1356 && DemandedResources == RHS.DemandedResources; 1357 } 1358 bool operator!=(const SchedResourceDelta &RHS) const { 1359 return !operator==(RHS); 1360 } 1361 }; 1362 1363 /// Store the state used by GenericScheduler heuristics, required for the 1364 /// lifetime of one invocation of pickNode(). 1365 struct SchedCandidate { 1366 CandPolicy Policy; 1367 1368 // The best SUnit candidate. 1369 SUnit *SU; 1370 1371 // The reason for this candidate. 1372 CandReason Reason; 1373 1374 // Set of reasons that apply to multiple candidates. 1375 uint32_t RepeatReasonSet; 1376 1377 // Register pressure values for the best candidate. 1378 RegPressureDelta RPDelta; 1379 1380 // Critical resource consumption of the best candidate. 1381 SchedResourceDelta ResDelta; 1382 1383 SchedCandidate(const CandPolicy &policy) 1384 : Policy(policy), SU(NULL), Reason(NoCand), RepeatReasonSet(0) {} 1385 1386 bool isValid() const { return SU; } 1387 1388 // Copy the status of another candidate without changing policy. 
1389 void setBest(SchedCandidate &Best) { 1390 assert(Best.Reason != NoCand && "uninitialized Sched candidate"); 1391 SU = Best.SU; 1392 Reason = Best.Reason; 1393 RPDelta = Best.RPDelta; 1394 ResDelta = Best.ResDelta; 1395 } 1396 1397 bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); } 1398 void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); } 1399 1400 void initResourceDelta(const ScheduleDAGMI *DAG, 1401 const TargetSchedModel *SchedModel); 1402 }; 1403 1404 /// Summarize the unscheduled region. 1405 struct SchedRemainder { 1406 // Critical path through the DAG in expected latency. 1407 unsigned CriticalPath; 1408 unsigned CyclicCritPath; 1409 1410 // Scaled count of micro-ops left to schedule. 1411 unsigned RemIssueCount; 1412 1413 bool IsAcyclicLatencyLimited; 1414 1415 // Unscheduled resources 1416 SmallVector<unsigned, 16> RemainingCounts; 1417 1418 void reset() { 1419 CriticalPath = 0; 1420 CyclicCritPath = 0; 1421 RemIssueCount = 0; 1422 IsAcyclicLatencyLimited = false; 1423 RemainingCounts.clear(); 1424 } 1425 1426 SchedRemainder() { reset(); } 1427 1428 void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel); 1429 }; 1430 1431 /// Each Scheduling boundary is associated with ready queues. It tracks the 1432 /// current cycle in the direction of movement, and maintains the state 1433 /// of "hazards" and other interlocks at the current cycle. 1434 struct SchedBoundary { 1435 ScheduleDAGMI *DAG; 1436 const TargetSchedModel *SchedModel; 1437 SchedRemainder *Rem; 1438 1439 ReadyQueue Available; 1440 ReadyQueue Pending; 1441 bool CheckPending; 1442 1443 // For heuristics, keep a list of the nodes that immediately depend on the 1444 // most recently scheduled node. 1445 SmallPtrSet<const SUnit*, 8> NextSUs; 1446 1447 ScheduleHazardRecognizer *HazardRec; 1448 1449 /// Number of cycles it takes to issue the instructions scheduled in this 1450 /// zone. It is defined as: scheduled-micro-ops / issue-width + stalls. 1451 /// See getStalls(). 1452 unsigned CurrCycle; 1453 1454 /// Micro-ops issued in the current cycle 1455 unsigned CurrMOps; 1456 1457 /// MinReadyCycle - Cycle of the soonest available instruction. 1458 unsigned MinReadyCycle; 1459 1460 // The expected latency of the critical path in this scheduled zone. 1461 unsigned ExpectedLatency; 1462 1463 // The latency of dependence chains leading into this zone. 1464 // For each node scheduled bottom-up: DLat = max DLat, N.Depth. 1465 // For each cycle scheduled: DLat -= 1. 1466 unsigned DependentLatency; 1467 1468 /// Count the scheduled (issued) micro-ops that can be retired by 1469 /// time=CurrCycle assuming the first scheduled instr is retired at time=0. 1470 unsigned RetiredMOps; 1471 1472 // Count scheduled resources that have been executed. Resources are 1473 // considered executed if they become ready in the time that it takes to 1474 // saturate any resource including the one in question. Counts are scaled 1475 // for direct comparison with other resources. Counts can be compared with 1476 // MOps * getMicroOpFactor and Latency * getLatencyFactor. 1477 SmallVector<unsigned, 16> ExecutedResCounts; 1478 1479 /// Cache the max count for a single resource. 1480 unsigned MaxExecutedResCount; 1481 1482 // Cache the critical resources ID in this scheduled zone. 1483 unsigned ZoneCritResIdx; 1484 1485 // Is the scheduled region resource limited vs. latency limited. 
1486 bool IsResourceLimited; 1487 1488 #ifndef NDEBUG 1489 // Remember the greatest operand latency as an upper bound on the number of 1490 // times we should retry the pending queue because of a hazard. 1491 unsigned MaxObservedLatency; 1492 #endif 1493 1494 void reset() { 1495 // A new HazardRec is created for each DAG and owned by SchedBoundary. 1496 // Destroying and reconstructing it is very expensive though. So keep 1497 // invalid, placeholder HazardRecs. 1498 if (HazardRec && HazardRec->isEnabled()) { 1499 delete HazardRec; 1500 HazardRec = 0; 1501 } 1502 Available.clear(); 1503 Pending.clear(); 1504 CheckPending = false; 1505 NextSUs.clear(); 1506 CurrCycle = 0; 1507 CurrMOps = 0; 1508 MinReadyCycle = UINT_MAX; 1509 ExpectedLatency = 0; 1510 DependentLatency = 0; 1511 RetiredMOps = 0; 1512 MaxExecutedResCount = 0; 1513 ZoneCritResIdx = 0; 1514 IsResourceLimited = false; 1515 #ifndef NDEBUG 1516 MaxObservedLatency = 0; 1517 #endif 1518 // Reserve a zero-count for invalid CritResIdx. 1519 ExecutedResCounts.resize(1); 1520 assert(!ExecutedResCounts[0] && "nonzero count for bad resource"); 1521 } 1522 1523 /// Pending queues extend the ready queues with the same ID and the 1524 /// PendingFlag set. 1525 SchedBoundary(unsigned ID, const Twine &Name): 1526 DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"), 1527 Pending(ID << GenericScheduler::LogMaxQID, Name+".P"), 1528 HazardRec(0) { 1529 reset(); 1530 } 1531 1532 ~SchedBoundary() { delete HazardRec; } 1533 1534 void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, 1535 SchedRemainder *rem); 1536 1537 bool isTop() const { 1538 return Available.getID() == GenericScheduler::TopQID; 1539 } 1540 1541 #ifndef NDEBUG 1542 const char *getResourceName(unsigned PIdx) { 1543 if (!PIdx) 1544 return "MOps"; 1545 return SchedModel->getProcResource(PIdx)->Name; 1546 } 1547 #endif 1548 1549 /// Get the number of latency cycles "covered" by the scheduled 1550 /// instructions. This is the larger of the critical path within the zone 1551 /// and the number of cycles required to issue the instructions. 1552 unsigned getScheduledLatency() const { 1553 return std::max(ExpectedLatency, CurrCycle); 1554 } 1555 1556 unsigned getUnscheduledLatency(SUnit *SU) const { 1557 return isTop() ? SU->getHeight() : SU->getDepth(); 1558 } 1559 1560 unsigned getResourceCount(unsigned ResIdx) const { 1561 return ExecutedResCounts[ResIdx]; 1562 } 1563 1564 /// Get the scaled count of scheduled micro-ops and resources, including 1565 /// executed resources. 1566 unsigned getCriticalCount() const { 1567 if (!ZoneCritResIdx) 1568 return RetiredMOps * SchedModel->getMicroOpFactor(); 1569 return getResourceCount(ZoneCritResIdx); 1570 } 1571 1572 /// Get a scaled count for the minimum execution time of the scheduled 1573 /// micro-ops that are ready to execute by getExecutedCount. Notice the 1574 /// feedback loop. 
1575 unsigned getExecutedCount() const { 1576 return std::max(CurrCycle * SchedModel->getLatencyFactor(), 1577 MaxExecutedResCount); 1578 } 1579 1580 bool checkHazard(SUnit *SU); 1581 1582 unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs); 1583 1584 unsigned getOtherResourceCount(unsigned &OtherCritIdx); 1585 1586 void setPolicy(CandPolicy &Policy, SchedBoundary &OtherZone); 1587 1588 void releaseNode(SUnit *SU, unsigned ReadyCycle); 1589 1590 void bumpCycle(unsigned NextCycle); 1591 1592 void incExecutedResources(unsigned PIdx, unsigned Count); 1593 1594 unsigned countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle); 1595 1596 void bumpNode(SUnit *SU); 1597 1598 void releasePending(); 1599 1600 void removeReady(SUnit *SU); 1601 1602 SUnit *pickOnlyChoice(); 1603 1604 #ifndef NDEBUG 1605 void dumpScheduledState(); 1606 #endif 1607 }; 1608 1609 private: 1610 const MachineSchedContext *Context; 1611 ScheduleDAGMI *DAG; 1612 const TargetSchedModel *SchedModel; 1613 const TargetRegisterInfo *TRI; 1614 1615 // State of the top and bottom scheduled instruction boundaries. 1616 SchedRemainder Rem; 1617 SchedBoundary Top; 1618 SchedBoundary Bot; 1619 1620 MachineSchedPolicy RegionPolicy; 1621 public: 1622 /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both) 1623 enum { 1624 TopQID = 1, 1625 BotQID = 2, 1626 LogMaxQID = 2 1627 }; 1628 1629 GenericScheduler(const MachineSchedContext *C): 1630 Context(C), DAG(0), SchedModel(0), TRI(0), 1631 Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {} 1632 1633 virtual void initPolicy(MachineBasicBlock::iterator Begin, 1634 MachineBasicBlock::iterator End, 1635 unsigned NumRegionInstrs); 1636 1637 bool shouldTrackPressure() const { return RegionPolicy.ShouldTrackPressure; } 1638 1639 virtual void initialize(ScheduleDAGMI *dag); 1640 1641 virtual SUnit *pickNode(bool &IsTopNode); 1642 1643 virtual void schedNode(SUnit *SU, bool IsTopNode); 1644 1645 virtual void releaseTopNode(SUnit *SU); 1646 1647 virtual void releaseBottomNode(SUnit *SU); 1648 1649 virtual void registerRoots(); 1650 1651 protected: 1652 void checkAcyclicLatency(); 1653 1654 void tryCandidate(SchedCandidate &Cand, 1655 SchedCandidate &TryCand, 1656 SchedBoundary &Zone, 1657 const RegPressureTracker &RPTracker, 1658 RegPressureTracker &TempTracker); 1659 1660 SUnit *pickNodeBidirectional(bool &IsTopNode); 1661 1662 void pickNodeFromQueue(SchedBoundary &Zone, 1663 const RegPressureTracker &RPTracker, 1664 SchedCandidate &Candidate); 1665 1666 void reschedulePhysRegCopies(SUnit *SU, bool isTop); 1667 1668 #ifndef NDEBUG 1669 void traceCandidate(const SchedCandidate &Cand); 1670 #endif 1671 }; 1672 } // namespace 1673 1674 void GenericScheduler::SchedRemainder:: 1675 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { 1676 reset(); 1677 if (!SchedModel->hasInstrSchedModel()) 1678 return; 1679 RemainingCounts.resize(SchedModel->getNumProcResourceKinds()); 1680 for (std::vector<SUnit>::iterator 1681 I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) { 1682 const MCSchedClassDesc *SC = DAG->getSchedClass(&*I); 1683 RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC) 1684 * SchedModel->getMicroOpFactor(); 1685 for (TargetSchedModel::ProcResIter 1686 PI = SchedModel->getWriteProcResBegin(SC), 1687 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 1688 unsigned PIdx = PI->ProcResourceIdx; 1689 unsigned Factor = SchedModel->getResourceFactor(PIdx); 1690 RemainingCounts[PIdx] += (Factor * PI->Cycles); 1691 } 1692 } 1693 } 1694 1695 void 
GenericScheduler::SchedBoundary:: 1696 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { 1697 reset(); 1698 DAG = dag; 1699 SchedModel = smodel; 1700 Rem = rem; 1701 if (SchedModel->hasInstrSchedModel()) 1702 ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds()); 1703 } 1704 1705 /// Initialize the per-region scheduling policy. 1706 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, 1707 MachineBasicBlock::iterator End, 1708 unsigned NumRegionInstrs) { 1709 const TargetMachine &TM = Context->MF->getTarget(); 1710 1711 // Avoid setting up the register pressure tracker for small regions to save 1712 // compile time. As a rough heuristic, only track pressure when the number of 1713 // schedulable instructions exceeds half the integer register file. 1714 unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs( 1715 TM.getTargetLowering()->getRegClassFor(MVT::i32)); 1716 1717 RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2); 1718 1719 // For generic targets, we default to bottom-up, because it's simpler and more 1720 // compile-time optimizations have been implemented in that direction. 1721 RegionPolicy.OnlyBottomUp = true; 1722 1723 // Allow the subtarget to override default policy. 1724 const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>(); 1725 ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs); 1726 1727 // After subtarget overrides, apply command line options. 1728 if (!EnableRegPressure) 1729 RegionPolicy.ShouldTrackPressure = false; 1730 1731 // Check -misched-topdown/bottomup can force or unforce scheduling direction. 1732 // e.g. -misched-bottomup=false allows scheduling in both directions. 1733 assert((!ForceTopDown || !ForceBottomUp) && 1734 "-misched-topdown incompatible with -misched-bottomup"); 1735 if (ForceBottomUp.getNumOccurrences() > 0) { 1736 RegionPolicy.OnlyBottomUp = ForceBottomUp; 1737 if (RegionPolicy.OnlyBottomUp) 1738 RegionPolicy.OnlyTopDown = false; 1739 } 1740 if (ForceTopDown.getNumOccurrences() > 0) { 1741 RegionPolicy.OnlyTopDown = ForceTopDown; 1742 if (RegionPolicy.OnlyTopDown) 1743 RegionPolicy.OnlyBottomUp = false; 1744 } 1745 } 1746 1747 void GenericScheduler::initialize(ScheduleDAGMI *dag) { 1748 DAG = dag; 1749 SchedModel = DAG->getSchedModel(); 1750 TRI = DAG->TRI; 1751 1752 Rem.init(DAG, SchedModel); 1753 Top.init(DAG, SchedModel, &Rem); 1754 Bot.init(DAG, SchedModel, &Rem); 1755 1756 // Initialize resource counts. 1757 1758 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or 1759 // are disabled, then these HazardRecs will be disabled. 
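  // (When a HazardRec is disabled, checkHazard() below falls back to the
  // scheduler's own micro-op counters instead of querying the recognizer.)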
1760 const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); 1761 const TargetMachine &TM = DAG->MF.getTarget(); 1762 if (!Top.HazardRec) { 1763 Top.HazardRec = 1764 TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); 1765 } 1766 if (!Bot.HazardRec) { 1767 Bot.HazardRec = 1768 TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); 1769 } 1770 } 1771 1772 void GenericScheduler::releaseTopNode(SUnit *SU) { 1773 if (SU->isScheduled) 1774 return; 1775 1776 for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); 1777 I != E; ++I) { 1778 if (I->isWeak()) 1779 continue; 1780 unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle; 1781 unsigned Latency = I->getLatency(); 1782 #ifndef NDEBUG 1783 Top.MaxObservedLatency = std::max(Latency, Top.MaxObservedLatency); 1784 #endif 1785 if (SU->TopReadyCycle < PredReadyCycle + Latency) 1786 SU->TopReadyCycle = PredReadyCycle + Latency; 1787 } 1788 Top.releaseNode(SU, SU->TopReadyCycle); 1789 } 1790 1791 void GenericScheduler::releaseBottomNode(SUnit *SU) { 1792 if (SU->isScheduled) 1793 return; 1794 1795 assert(SU->getInstr() && "Scheduled SUnit must have instr"); 1796 1797 for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); 1798 I != E; ++I) { 1799 if (I->isWeak()) 1800 continue; 1801 unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; 1802 unsigned Latency = I->getLatency(); 1803 #ifndef NDEBUG 1804 Bot.MaxObservedLatency = std::max(Latency, Bot.MaxObservedLatency); 1805 #endif 1806 if (SU->BotReadyCycle < SuccReadyCycle + Latency) 1807 SU->BotReadyCycle = SuccReadyCycle + Latency; 1808 } 1809 Bot.releaseNode(SU, SU->BotReadyCycle); 1810 } 1811 1812 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic 1813 /// critical path by more cycles than it takes to drain the instruction buffer. 1814 /// We estimate an upper bounds on in-flight instructions as: 1815 /// 1816 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height ) 1817 /// InFlightIterations = AcyclicPath / CyclesPerIteration 1818 /// InFlightResources = InFlightIterations * LoopResources 1819 /// 1820 /// TODO: Check execution resources in addition to IssueCount. 1821 void GenericScheduler::checkAcyclicLatency() { 1822 if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath) 1823 return; 1824 1825 // Scaled number of cycles per loop iteration. 1826 unsigned IterCount = 1827 std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(), 1828 Rem.RemIssueCount); 1829 // Scaled acyclic critical path. 
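  // (Worked example with made-up numbers: CriticalPath=20c, CyclicCritPath=5c,
  // LatencyFactor=1 and RemIssueCount=8 scaled micro-ops give IterCount=8,
  // AcyclicCount=20 and InFlightCount=(20*8+7)/8=20, which is then compared
  // against the scaled micro-op buffer size below.)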
  unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
  // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
  unsigned InFlightCount =
    (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
  unsigned BufferLimit =
    SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();

  Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;

  DEBUG(dbgs() << "IssueCycles="
        << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
        << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
        << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
        << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
        << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
        if (Rem.IsAcyclicLatencyLimited)
          dbgs() << " ACYCLIC LATENCY LIMIT\n");
}

void GenericScheduler::registerRoots() {
  Rem.CriticalPath = DAG->ExitSU.getDepth();

  // Some roots may not feed into ExitSU. Check all of them in case.
  for (std::vector<SUnit*>::const_iterator
         I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
    if ((*I)->getDepth() > Rem.CriticalPath)
      Rem.CriticalPath = (*I)->getDepth();
  }
  DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');

  if (EnableCyclicPath) {
    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
    checkAcyclicLatency();
  }
}

/// Does this SU have a hazard within the current instruction group.
///
/// The scheduler supports two modes of hazard recognition. The first is the
/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
/// supports highly complicated in-order reservation tables
/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
///
/// The second is a streamlined mechanism that checks for hazards based on
/// simple counters that the scheduler itself maintains. It explicitly checks
/// for instruction dispatch limitations, including the number of micro-ops that
/// can dispatch per cycle.
///
/// TODO: Also check whether the SU must start a new group.
bool GenericScheduler::SchedBoundary::checkHazard(SUnit *SU) {
  if (HazardRec->isEnabled())
    return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;

  unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
  if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
    DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
          << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
    return true;
  }
  return false;
}

// Find the unscheduled node in ReadySUs with the highest latency.
unsigned GenericScheduler::SchedBoundary::
findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
  SUnit *LateSU = 0;
  unsigned RemLatency = 0;
  for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end();
       I != E; ++I) {
    unsigned L = getUnscheduledLatency(*I);
    if (L > RemLatency) {
      RemLatency = L;
      LateSU = *I;
    }
  }
  if (LateSU) {
    DEBUG(dbgs() << Available.getName() << " RemLatency SU("
          << LateSU->NodeNum << ") " << RemLatency << "c\n");
  }
  return RemLatency;
}

// Count resources in this zone and the remaining unscheduled
// instruction. Return the max count, scaled. Set OtherCritIdx to the critical
// resource index, or zero if the zone is issue limited.
unsigned GenericScheduler::SchedBoundary::
getOtherResourceCount(unsigned &OtherCritIdx) {
  OtherCritIdx = 0;
  if (!SchedModel->hasInstrSchedModel())
    return 0;

  unsigned OtherCritCount = Rem->RemIssueCount
    + (RetiredMOps * SchedModel->getMicroOpFactor());
  DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: "
        << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
  for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
       PIdx != PEnd; ++PIdx) {
    unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
    if (OtherCount > OtherCritCount) {
      OtherCritCount = OtherCount;
      OtherCritIdx = PIdx;
    }
  }
  if (OtherCritIdx) {
    DEBUG(dbgs() << " " << Available.getName() << " + Remain CritRes: "
          << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
          << " " << getResourceName(OtherCritIdx) << "\n");
  }
  return OtherCritCount;
}

/// Set the CandPolicy for this zone given the current resources and latencies
/// inside and outside the zone.
void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
                                                SchedBoundary &OtherZone) {
  // Now that potential stalls have been considered, apply preemptive heuristics
  // based on the total latency and resources inside and outside this zone.

  // Compute remaining latency. We need this both to determine whether the
  // overall schedule has become latency-limited and whether the instructions
  // outside this zone are resource or latency limited.
  //
  // The "dependent" latency is updated incrementally during scheduling as the
  // max height/depth of scheduled nodes minus the cycles since it was
  // scheduled:
  //   DLat = max (N.depth - (CurrCycle - N.ReadyCycle)) for N in Zone
  //
  // The "independent" latency is the max ready queue depth:
  //   ILat = max N.depth for N in Available|Pending
  //
  // RemainingLatency is the greater of independent and dependent latency.
  unsigned RemLatency = DependentLatency;
  RemLatency = std::max(RemLatency, findMaxLatency(Available.elements()));
  RemLatency = std::max(RemLatency, findMaxLatency(Pending.elements()));

  // Compute the critical resource outside the zone.
  unsigned OtherCritIdx;
  unsigned OtherCount = OtherZone.getOtherResourceCount(OtherCritIdx);

  bool OtherResLimited = false;
  if (SchedModel->hasInstrSchedModel()) {
    unsigned LFactor = SchedModel->getLatencyFactor();
    OtherResLimited = (int)(OtherCount - (RemLatency * LFactor)) > (int)LFactor;
  }
  if (!OtherResLimited && (RemLatency + CurrCycle > Rem->CriticalPath)) {
    Policy.ReduceLatency |= true;
    DEBUG(dbgs() << " " << Available.getName() << " RemainingLatency "
          << RemLatency << " + " << CurrCycle << "c > CritPath "
          << Rem->CriticalPath << "\n");
  }
  // If the same resource is limiting inside and outside the zone, do nothing.
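  // (Otherwise, when this zone is resource limited its critical resource
  // becomes Policy.ReduceResIdx, and when the other zone is resource limited
  // its critical resource becomes Policy.DemandResIdx; tryCandidate() uses
  // both to bias candidate selection.)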
1982 if (ZoneCritResIdx == OtherCritIdx) 1983 return; 1984 1985 DEBUG( 1986 if (IsResourceLimited) { 1987 dbgs() << " " << Available.getName() << " ResourceLimited: " 1988 << getResourceName(ZoneCritResIdx) << "\n"; 1989 } 1990 if (OtherResLimited) 1991 dbgs() << " RemainingLimit: " << getResourceName(OtherCritIdx) << "\n"; 1992 if (!IsResourceLimited && !OtherResLimited) 1993 dbgs() << " Latency limited both directions.\n"); 1994 1995 if (IsResourceLimited && !Policy.ReduceResIdx) 1996 Policy.ReduceResIdx = ZoneCritResIdx; 1997 1998 if (OtherResLimited) 1999 Policy.DemandResIdx = OtherCritIdx; 2000 } 2001 2002 void GenericScheduler::SchedBoundary::releaseNode(SUnit *SU, 2003 unsigned ReadyCycle) { 2004 if (ReadyCycle < MinReadyCycle) 2005 MinReadyCycle = ReadyCycle; 2006 2007 // Check for interlocks first. For the purpose of other heuristics, an 2008 // instruction that cannot issue appears as if it's not in the ReadyQueue. 2009 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; 2010 if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU)) 2011 Pending.push(SU); 2012 else 2013 Available.push(SU); 2014 2015 // Record this node as an immediate dependent of the scheduled node. 2016 NextSUs.insert(SU); 2017 } 2018 2019 /// Move the boundary of scheduled code by one cycle. 2020 void GenericScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) { 2021 if (SchedModel->getMicroOpBufferSize() == 0) { 2022 assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); 2023 if (MinReadyCycle > NextCycle) 2024 NextCycle = MinReadyCycle; 2025 } 2026 // Update the current micro-ops, which will issue in the next cycle. 2027 unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle); 2028 CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps; 2029 2030 // Decrement DependentLatency based on the next cycle. 2031 if ((NextCycle - CurrCycle) > DependentLatency) 2032 DependentLatency = 0; 2033 else 2034 DependentLatency -= (NextCycle - CurrCycle); 2035 2036 if (!HazardRec->isEnabled()) { 2037 // Bypass HazardRec virtual calls. 2038 CurrCycle = NextCycle; 2039 } 2040 else { 2041 // Bypass getHazardType calls in case of long latency. 2042 for (; CurrCycle != NextCycle; ++CurrCycle) { 2043 if (isTop()) 2044 HazardRec->AdvanceCycle(); 2045 else 2046 HazardRec->RecedeCycle(); 2047 } 2048 } 2049 CheckPending = true; 2050 unsigned LFactor = SchedModel->getLatencyFactor(); 2051 IsResourceLimited = 2052 (int)(getCriticalCount() - (getScheduledLatency() * LFactor)) 2053 > (int)LFactor; 2054 2055 DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n'); 2056 } 2057 2058 void GenericScheduler::SchedBoundary::incExecutedResources(unsigned PIdx, 2059 unsigned Count) { 2060 ExecutedResCounts[PIdx] += Count; 2061 if (ExecutedResCounts[PIdx] > MaxExecutedResCount) 2062 MaxExecutedResCount = ExecutedResCounts[PIdx]; 2063 } 2064 2065 /// Add the given processor resource to this scheduled zone. 2066 /// 2067 /// \param Cycles indicates the number of consecutive (non-pipelined) cycles 2068 /// during which this resource is consumed. 2069 /// 2070 /// \return the next cycle at which the instruction may execute without 2071 /// oversubscribing resources. 
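/// (Illustrative example: a resource held for 2 cycles with a resource factor
/// of 3 adds 2*3 = 6 scaled units to ExecutedResCounts[PIdx] and removes the
/// same amount from the remaining counts for the region.)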
2072 unsigned GenericScheduler::SchedBoundary:: 2073 countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) { 2074 unsigned Factor = SchedModel->getResourceFactor(PIdx); 2075 unsigned Count = Factor * Cycles; 2076 DEBUG(dbgs() << " " << getResourceName(PIdx) 2077 << " +" << Cycles << "x" << Factor << "u\n"); 2078 2079 // Update Executed resources counts. 2080 incExecutedResources(PIdx, Count); 2081 assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted"); 2082 Rem->RemainingCounts[PIdx] -= Count; 2083 2084 // Check if this resource exceeds the current critical resource. If so, it 2085 // becomes the critical resource. 2086 if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) { 2087 ZoneCritResIdx = PIdx; 2088 DEBUG(dbgs() << " *** Critical resource " 2089 << getResourceName(PIdx) << ": " 2090 << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n"); 2091 } 2092 // TODO: We don't yet model reserved resources. It's not hard though. 2093 return CurrCycle; 2094 } 2095 2096 /// Move the boundary of scheduled code by one SUnit. 2097 void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) { 2098 // Update the reservation table. 2099 if (HazardRec->isEnabled()) { 2100 if (!isTop() && SU->isCall) { 2101 // Calls are scheduled with their preceding instructions. For bottom-up 2102 // scheduling, clear the pipeline state before emitting. 2103 HazardRec->Reset(); 2104 } 2105 HazardRec->EmitInstruction(SU); 2106 } 2107 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 2108 unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr()); 2109 CurrMOps += IncMOps; 2110 // checkHazard prevents scheduling multiple instructions per cycle that exceed 2111 // issue width. However, we commonly reach the maximum. In this case 2112 // opportunistically bump the cycle to avoid uselessly checking everything in 2113 // the readyQ. Furthermore, a single instruction may produce more than one 2114 // cycle's worth of micro-ops. 2115 // 2116 // TODO: Also check if this SU must end a dispatch group. 2117 unsigned NextCycle = CurrCycle; 2118 if (CurrMOps >= SchedModel->getIssueWidth()) { 2119 ++NextCycle; 2120 DEBUG(dbgs() << " *** Max MOps " << CurrMOps 2121 << " at cycle " << CurrCycle << '\n'); 2122 } 2123 unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle); 2124 DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n"); 2125 2126 switch (SchedModel->getMicroOpBufferSize()) { 2127 case 0: 2128 assert(ReadyCycle <= CurrCycle && "Broken PendingQueue"); 2129 break; 2130 case 1: 2131 if (ReadyCycle > NextCycle) { 2132 NextCycle = ReadyCycle; 2133 DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n"); 2134 } 2135 break; 2136 default: 2137 // We don't currently model the OOO reorder buffer, so consider all 2138 // scheduled MOps to be "retired". 2139 break; 2140 } 2141 RetiredMOps += IncMOps; 2142 2143 // Update resource counts and critical resource. 2144 if (SchedModel->hasInstrSchedModel()) { 2145 unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor(); 2146 assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted"); 2147 Rem->RemIssueCount -= DecRemIssue; 2148 if (ZoneCritResIdx) { 2149 // Scale scheduled micro-ops for comparing with the critical resource. 2150 unsigned ScaledMOps = 2151 RetiredMOps * SchedModel->getMicroOpFactor(); 2152 2153 // If scaled micro-ops are now more than the previous critical resource by 2154 // a full cycle, then micro-ops issue becomes critical. 
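      // (A ZoneCritResIdx of zero means "micro-op issue is the critical
      // resource"; getCriticalCount() then reports RetiredMOps scaled by the
      // micro-op factor instead of a per-resource count.)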
2155 if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx)) 2156 >= (int)SchedModel->getLatencyFactor()) { 2157 ZoneCritResIdx = 0; 2158 DEBUG(dbgs() << " *** Critical resource NumMicroOps: " 2159 << ScaledMOps / SchedModel->getLatencyFactor() << "c\n"); 2160 } 2161 } 2162 for (TargetSchedModel::ProcResIter 2163 PI = SchedModel->getWriteProcResBegin(SC), 2164 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2165 unsigned RCycle = 2166 countResource(PI->ProcResourceIdx, PI->Cycles, ReadyCycle); 2167 if (RCycle > NextCycle) 2168 NextCycle = RCycle; 2169 } 2170 } 2171 // Update ExpectedLatency and DependentLatency. 2172 unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency; 2173 unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency; 2174 if (SU->getDepth() > TopLatency) { 2175 TopLatency = SU->getDepth(); 2176 DEBUG(dbgs() << " " << Available.getName() 2177 << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n"); 2178 } 2179 if (SU->getHeight() > BotLatency) { 2180 BotLatency = SU->getHeight(); 2181 DEBUG(dbgs() << " " << Available.getName() 2182 << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n"); 2183 } 2184 // If we stall for any reason, bump the cycle. 2185 if (NextCycle > CurrCycle) { 2186 bumpCycle(NextCycle); 2187 } 2188 else { 2189 // After updating ZoneCritResIdx and ExpectedLatency, check if we're 2190 // resource limited. If a stall occured, bumpCycle does this. 2191 unsigned LFactor = SchedModel->getLatencyFactor(); 2192 IsResourceLimited = 2193 (int)(getCriticalCount() - (getScheduledLatency() * LFactor)) 2194 > (int)LFactor; 2195 } 2196 DEBUG(dumpScheduledState()); 2197 } 2198 2199 /// Release pending ready nodes in to the available queue. This makes them 2200 /// visible to heuristics. 2201 void GenericScheduler::SchedBoundary::releasePending() { 2202 // If the available queue is empty, it is safe to reset MinReadyCycle. 2203 if (Available.empty()) 2204 MinReadyCycle = UINT_MAX; 2205 2206 // Check to see if any of the pending instructions are ready to issue. If 2207 // so, add them to the available queue. 2208 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; 2209 for (unsigned i = 0, e = Pending.size(); i != e; ++i) { 2210 SUnit *SU = *(Pending.begin()+i); 2211 unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; 2212 2213 if (ReadyCycle < MinReadyCycle) 2214 MinReadyCycle = ReadyCycle; 2215 2216 if (!IsBuffered && ReadyCycle > CurrCycle) 2217 continue; 2218 2219 if (checkHazard(SU)) 2220 continue; 2221 2222 Available.push(SU); 2223 Pending.remove(Pending.begin()+i); 2224 --i; --e; 2225 } 2226 DEBUG(if (!Pending.empty()) Pending.dump()); 2227 CheckPending = false; 2228 } 2229 2230 /// Remove SU from the ready set for this boundary. 2231 void GenericScheduler::SchedBoundary::removeReady(SUnit *SU) { 2232 if (Available.isInQueue(SU)) 2233 Available.remove(Available.find(SU)); 2234 else { 2235 assert(Pending.isInQueue(SU) && "bad ready count"); 2236 Pending.remove(Pending.find(SU)); 2237 } 2238 } 2239 2240 /// If this queue only has one ready candidate, return it. As a side effect, 2241 /// defer any nodes that now hit a hazard, and advance the cycle until at least 2242 /// one node is ready. If multiple instructions are ready, return NULL. 2243 SUnit *GenericScheduler::SchedBoundary::pickOnlyChoice() { 2244 if (CheckPending) 2245 releasePending(); 2246 2247 if (CurrMOps > 0) { 2248 // Defer any ready instrs that now have a hazard. 
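    // (Deferred SUs go back to the Pending queue; releasePending() re-releases
    // them once bumpCycle() has advanced far enough for the hazard to clear.)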
2249 for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) { 2250 if (checkHazard(*I)) { 2251 Pending.push(*I); 2252 I = Available.remove(I); 2253 continue; 2254 } 2255 ++I; 2256 } 2257 } 2258 for (unsigned i = 0; Available.empty(); ++i) { 2259 assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) && 2260 "permanent hazard"); (void)i; 2261 bumpCycle(CurrCycle + 1); 2262 releasePending(); 2263 } 2264 if (Available.size() == 1) 2265 return *Available.begin(); 2266 return NULL; 2267 } 2268 2269 #ifndef NDEBUG 2270 // This is useful information to dump after bumpNode. 2271 // Note that the Queue contents are more useful before pickNodeFromQueue. 2272 void GenericScheduler::SchedBoundary::dumpScheduledState() { 2273 unsigned ResFactor; 2274 unsigned ResCount; 2275 if (ZoneCritResIdx) { 2276 ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx); 2277 ResCount = getResourceCount(ZoneCritResIdx); 2278 } 2279 else { 2280 ResFactor = SchedModel->getMicroOpFactor(); 2281 ResCount = RetiredMOps * SchedModel->getMicroOpFactor(); 2282 } 2283 unsigned LFactor = SchedModel->getLatencyFactor(); 2284 dbgs() << Available.getName() << " @" << CurrCycle << "c\n" 2285 << " Retired: " << RetiredMOps; 2286 dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c"; 2287 dbgs() << "\n Critical: " << ResCount / LFactor << "c, " 2288 << ResCount / ResFactor << " " << getResourceName(ZoneCritResIdx) 2289 << "\n ExpectedLatency: " << ExpectedLatency << "c\n" 2290 << (IsResourceLimited ? " - Resource" : " - Latency") 2291 << " limited.\n"; 2292 } 2293 #endif 2294 2295 void GenericScheduler::SchedCandidate:: 2296 initResourceDelta(const ScheduleDAGMI *DAG, 2297 const TargetSchedModel *SchedModel) { 2298 if (!Policy.ReduceResIdx && !Policy.DemandResIdx) 2299 return; 2300 2301 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 2302 for (TargetSchedModel::ProcResIter 2303 PI = SchedModel->getWriteProcResBegin(SC), 2304 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2305 if (PI->ProcResourceIdx == Policy.ReduceResIdx) 2306 ResDelta.CritResources += PI->Cycles; 2307 if (PI->ProcResourceIdx == Policy.DemandResIdx) 2308 ResDelta.DemandedResources += PI->Cycles; 2309 } 2310 } 2311 2312 2313 /// Return true if this heuristic determines order. 
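/// tryLess/tryGreater return true as soon as the two values differ, recording
/// the winning reason on the preferred candidate so that lower priority
/// heuristics are skipped; on a tie they only mark the reason as repeated and
/// let evaluation fall through to the next heuristic.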
2314 static bool tryLess(int TryVal, int CandVal, 2315 GenericScheduler::SchedCandidate &TryCand, 2316 GenericScheduler::SchedCandidate &Cand, 2317 GenericScheduler::CandReason Reason) { 2318 if (TryVal < CandVal) { 2319 TryCand.Reason = Reason; 2320 return true; 2321 } 2322 if (TryVal > CandVal) { 2323 if (Cand.Reason > Reason) 2324 Cand.Reason = Reason; 2325 return true; 2326 } 2327 Cand.setRepeat(Reason); 2328 return false; 2329 } 2330 2331 static bool tryGreater(int TryVal, int CandVal, 2332 GenericScheduler::SchedCandidate &TryCand, 2333 GenericScheduler::SchedCandidate &Cand, 2334 GenericScheduler::CandReason Reason) { 2335 if (TryVal > CandVal) { 2336 TryCand.Reason = Reason; 2337 return true; 2338 } 2339 if (TryVal < CandVal) { 2340 if (Cand.Reason > Reason) 2341 Cand.Reason = Reason; 2342 return true; 2343 } 2344 Cand.setRepeat(Reason); 2345 return false; 2346 } 2347 2348 static bool tryPressure(const PressureChange &TryP, 2349 const PressureChange &CandP, 2350 GenericScheduler::SchedCandidate &TryCand, 2351 GenericScheduler::SchedCandidate &Cand, 2352 GenericScheduler::CandReason Reason) { 2353 int TryRank = TryP.getPSetOrMax(); 2354 int CandRank = CandP.getPSetOrMax(); 2355 // If both candidates affect the same set, go with the smallest increase. 2356 if (TryRank == CandRank) { 2357 return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, 2358 Reason); 2359 } 2360 // If one candidate decreases and the other increases, go with it. 2361 // Invalid candidates have UnitInc==0. 2362 if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, 2363 Reason)) { 2364 return true; 2365 } 2366 // If the candidates are decreasing pressure, reverse priority. 2367 if (TryP.getUnitInc() < 0) 2368 std::swap(TryRank, CandRank); 2369 return tryGreater(TryRank, CandRank, TryCand, Cand, Reason); 2370 } 2371 2372 static unsigned getWeakLeft(const SUnit *SU, bool isTop) { 2373 return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft; 2374 } 2375 2376 /// Minimize physical register live ranges. Regalloc wants them adjacent to 2377 /// their physreg def/use. 2378 /// 2379 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf 2380 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled 2381 /// with the operation that produces or consumes the physreg. We'll do this when 2382 /// regalloc has support for parallel copies. 2383 static int biasPhysRegCopy(const SUnit *SU, bool isTop) { 2384 const MachineInstr *MI = SU->getInstr(); 2385 if (!MI->isCopy()) 2386 return 0; 2387 2388 unsigned ScheduledOper = isTop ? 1 : 0; 2389 unsigned UnscheduledOper = isTop ? 0 : 1; 2390 // If we have already scheduled the physreg produce/consumer, immediately 2391 // schedule the copy. 2392 if (TargetRegisterInfo::isPhysicalRegister( 2393 MI->getOperand(ScheduledOper).getReg())) 2394 return 1; 2395 // If the physreg is at the boundary, defer it. Otherwise schedule it 2396 // immediately to free the dependent. We can hoist the copy later. 2397 bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; 2398 if (TargetRegisterInfo::isPhysicalRegister( 2399 MI->getOperand(UnscheduledOper).getReg())) 2400 return AtBoundary ? 
                           -1 : 1;
  return 0;
}

static bool tryLatency(GenericScheduler::SchedCandidate &TryCand,
                       GenericScheduler::SchedCandidate &Cand,
                       GenericScheduler::SchedBoundary &Zone) {
  if (Zone.isTop()) {
    if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
      if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
                  TryCand, Cand, GenericScheduler::TopDepthReduce))
        return true;
    }
    if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
                   TryCand, Cand, GenericScheduler::TopPathReduce))
      return true;
  }
  else {
    if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
      if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
                  TryCand, Cand, GenericScheduler::BotHeightReduce))
        return true;
    }
    if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
                   TryCand, Cand, GenericScheduler::BotPathReduce))
      return true;
  }
  return false;
}

/// Apply a set of heuristics to a new candidate. Heuristics are currently
/// hierarchical. This may be more efficient than a graduated cost model because
/// we don't need to evaluate all aspects of the model for each node in the
/// queue. But it's really done to make the heuristics easier to debug and
/// statistically analyze.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending.
/// \param RPTracker describes reg pressure within the scheduled zone.
/// \param TempTracker is a scratch pressure tracker to reuse in queries.
void GenericScheduler::tryCandidate(SchedCandidate &Cand,
                                    SchedCandidate &TryCand,
                                    SchedBoundary &Zone,
                                    const RegPressureTracker &RPTracker,
                                    RegPressureTracker &TempTracker) {

  if (DAG->isTrackingPressure()) {
    // Always initialize TryCand's RPDelta.
    if (Zone.isTop()) {
      TempTracker.getMaxDownwardPressureDelta(
        TryCand.SU->getInstr(),
        TryCand.RPDelta,
        DAG->getRegionCriticalPSets(),
        DAG->getRegPressure().MaxSetPressure);
    }
    else {
      if (VerifyScheduling) {
        TempTracker.getMaxUpwardPressureDelta(
          TryCand.SU->getInstr(),
          &DAG->getPressureDiff(TryCand.SU),
          TryCand.RPDelta,
          DAG->getRegionCriticalPSets(),
          DAG->getRegPressure().MaxSetPressure);
      }
      else {
        RPTracker.getUpwardPressureDelta(
          TryCand.SU->getInstr(),
          DAG->getPressureDiff(TryCand.SU),
          TryCand.RPDelta,
          DAG->getRegionCriticalPSets(),
          DAG->getRegPressure().MaxSetPressure);
      }
    }
  }
  DEBUG(if (TryCand.RPDelta.Excess.isValid())
          dbgs() << " SU(" << TryCand.SU->NodeNum << ") "
                 << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
                 << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");

  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()),
                 biasPhysRegCopy(Cand.SU, Zone.isTop()),
                 TryCand, Cand, PhysRegCopy))
    return;

  // Avoid exceeding the target's limit. If signed PSetID is negative, it is
  // invalid; convert it to INT_MAX to give it lowest priority.
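  // (Each of the checks below either decides the comparison outright or falls
  // through to the next, lower priority heuristic, ending with original
  // instruction order as the final tie breaker.)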
2493 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess, 2494 Cand.RPDelta.Excess, 2495 TryCand, Cand, RegExcess)) 2496 return; 2497 2498 // Avoid increasing the max critical pressure in the scheduled region. 2499 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, 2500 Cand.RPDelta.CriticalMax, 2501 TryCand, Cand, RegCritical)) 2502 return; 2503 2504 // For loops that are acyclic path limited, aggressively schedule for latency. 2505 // This can result in very long dependence chains scheduled in sequence, so 2506 // once every cycle (when CurrMOps == 0), switch to normal heuristics. 2507 if (Rem.IsAcyclicLatencyLimited && !Zone.CurrMOps 2508 && tryLatency(TryCand, Cand, Zone)) 2509 return; 2510 2511 // Keep clustered nodes together to encourage downstream peephole 2512 // optimizations which may reduce resource requirements. 2513 // 2514 // This is a best effort to set things up for a post-RA pass. Optimizations 2515 // like generating loads of multiple registers should ideally be done within 2516 // the scheduler pass by combining the loads during DAG postprocessing. 2517 const SUnit *NextClusterSU = 2518 Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); 2519 if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU, 2520 TryCand, Cand, Cluster)) 2521 return; 2522 2523 // Weak edges are for clustering and other constraints. 2524 if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()), 2525 getWeakLeft(Cand.SU, Zone.isTop()), 2526 TryCand, Cand, Weak)) { 2527 return; 2528 } 2529 // Avoid increasing the max pressure of the entire region. 2530 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, 2531 Cand.RPDelta.CurrentMax, 2532 TryCand, Cand, RegMax)) 2533 return; 2534 2535 // Avoid critical resource consumption and balance the schedule. 2536 TryCand.initResourceDelta(DAG, SchedModel); 2537 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, 2538 TryCand, Cand, ResourceReduce)) 2539 return; 2540 if (tryGreater(TryCand.ResDelta.DemandedResources, 2541 Cand.ResDelta.DemandedResources, 2542 TryCand, Cand, ResourceDemand)) 2543 return; 2544 2545 // Avoid serializing long latency dependence chains. 2546 // For acyclic path limited loops, latency was already checked above. 2547 if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited 2548 && tryLatency(TryCand, Cand, Zone)) { 2549 return; 2550 } 2551 2552 // Prefer immediate defs/users of the last scheduled instruction. This is a 2553 // local pressure avoidance strategy that also makes the machine code 2554 // readable. 2555 if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU), 2556 TryCand, Cand, NextDefUse)) 2557 return; 2558 2559 // Fall through to original instruction order. 
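  // (A lower NodeNum corresponds to an earlier position in the original
  // instruction stream, so the top zone prefers the lower numbered node and
  // the bottom zone the higher numbered one.)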
2560 if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) 2561 || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { 2562 TryCand.Reason = NodeOrder; 2563 } 2564 } 2565 2566 #ifndef NDEBUG 2567 const char *GenericScheduler::getReasonStr( 2568 GenericScheduler::CandReason Reason) { 2569 switch (Reason) { 2570 case NoCand: return "NOCAND "; 2571 case PhysRegCopy: return "PREG-COPY"; 2572 case RegExcess: return "REG-EXCESS"; 2573 case RegCritical: return "REG-CRIT "; 2574 case Cluster: return "CLUSTER "; 2575 case Weak: return "WEAK "; 2576 case RegMax: return "REG-MAX "; 2577 case ResourceReduce: return "RES-REDUCE"; 2578 case ResourceDemand: return "RES-DEMAND"; 2579 case TopDepthReduce: return "TOP-DEPTH "; 2580 case TopPathReduce: return "TOP-PATH "; 2581 case BotHeightReduce:return "BOT-HEIGHT"; 2582 case BotPathReduce: return "BOT-PATH "; 2583 case NextDefUse: return "DEF-USE "; 2584 case NodeOrder: return "ORDER "; 2585 }; 2586 llvm_unreachable("Unknown reason!"); 2587 } 2588 2589 void GenericScheduler::traceCandidate(const SchedCandidate &Cand) { 2590 PressureChange P; 2591 unsigned ResIdx = 0; 2592 unsigned Latency = 0; 2593 switch (Cand.Reason) { 2594 default: 2595 break; 2596 case RegExcess: 2597 P = Cand.RPDelta.Excess; 2598 break; 2599 case RegCritical: 2600 P = Cand.RPDelta.CriticalMax; 2601 break; 2602 case RegMax: 2603 P = Cand.RPDelta.CurrentMax; 2604 break; 2605 case ResourceReduce: 2606 ResIdx = Cand.Policy.ReduceResIdx; 2607 break; 2608 case ResourceDemand: 2609 ResIdx = Cand.Policy.DemandResIdx; 2610 break; 2611 case TopDepthReduce: 2612 Latency = Cand.SU->getDepth(); 2613 break; 2614 case TopPathReduce: 2615 Latency = Cand.SU->getHeight(); 2616 break; 2617 case BotHeightReduce: 2618 Latency = Cand.SU->getHeight(); 2619 break; 2620 case BotPathReduce: 2621 Latency = Cand.SU->getDepth(); 2622 break; 2623 } 2624 dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); 2625 if (P.isValid()) 2626 dbgs() << " " << TRI->getRegPressureSetName(P.getPSet()) 2627 << ":" << P.getUnitInc() << " "; 2628 else 2629 dbgs() << " "; 2630 if (ResIdx) 2631 dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " "; 2632 else 2633 dbgs() << " "; 2634 if (Latency) 2635 dbgs() << " " << Latency << " cycles "; 2636 else 2637 dbgs() << " "; 2638 dbgs() << '\n'; 2639 } 2640 #endif 2641 2642 /// Pick the best candidate from the queue. 2643 /// 2644 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during 2645 /// DAG building. To adjust for the current scheduling location we need to 2646 /// maintain the number of vreg uses remaining to be top-scheduled. 2647 void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, 2648 const RegPressureTracker &RPTracker, 2649 SchedCandidate &Cand) { 2650 ReadyQueue &Q = Zone.Available; 2651 2652 DEBUG(Q.dump()); 2653 2654 // getMaxPressureDelta temporarily modifies the tracker. 2655 RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); 2656 2657 for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { 2658 2659 SchedCandidate TryCand(Cand.Policy); 2660 TryCand.SU = *I; 2661 tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker); 2662 if (TryCand.Reason != NoCand) { 2663 // Initialize resource delta if needed in case future heuristics query it. 
2664 if (TryCand.ResDelta == SchedResourceDelta()) 2665 TryCand.initResourceDelta(DAG, SchedModel); 2666 Cand.setBest(TryCand); 2667 DEBUG(traceCandidate(Cand)); 2668 } 2669 } 2670 } 2671 2672 static void tracePick(const GenericScheduler::SchedCandidate &Cand, 2673 bool IsTop) { 2674 DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") 2675 << GenericScheduler::getReasonStr(Cand.Reason) << '\n'); 2676 } 2677 2678 /// Pick the best candidate node from either the top or bottom queue. 2679 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) { 2680 // Schedule as far as possible in the direction of no choice. This is most 2681 // efficient, but also provides the best heuristics for CriticalPSets. 2682 if (SUnit *SU = Bot.pickOnlyChoice()) { 2683 IsTopNode = false; 2684 DEBUG(dbgs() << "Pick Bot NOCAND\n"); 2685 return SU; 2686 } 2687 if (SUnit *SU = Top.pickOnlyChoice()) { 2688 IsTopNode = true; 2689 DEBUG(dbgs() << "Pick Top NOCAND\n"); 2690 return SU; 2691 } 2692 CandPolicy NoPolicy; 2693 SchedCandidate BotCand(NoPolicy); 2694 SchedCandidate TopCand(NoPolicy); 2695 Bot.setPolicy(BotCand.Policy, Top); 2696 Top.setPolicy(TopCand.Policy, Bot); 2697 2698 // Prefer bottom scheduling when heuristics are silent. 2699 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); 2700 assert(BotCand.Reason != NoCand && "failed to find the first candidate"); 2701 2702 // If either Q has a single candidate that provides the least increase in 2703 // Excess pressure, we can immediately schedule from that Q. 2704 // 2705 // RegionCriticalPSets summarizes the pressure within the scheduled region and 2706 // affects picking from either Q. If scheduling in one direction must 2707 // increase pressure for one of the excess PSets, then schedule in that 2708 // direction first to provide more freedom in the other direction. 2709 if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess)) 2710 || (BotCand.Reason == RegCritical 2711 && !BotCand.isRepeat(RegCritical))) 2712 { 2713 IsTopNode = false; 2714 tracePick(BotCand, IsTopNode); 2715 return BotCand.SU; 2716 } 2717 // Check if the top Q has a better candidate. 2718 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); 2719 assert(TopCand.Reason != NoCand && "failed to find the first candidate"); 2720 2721 // Choose the queue with the most important (lowest enum) reason. 2722 if (TopCand.Reason < BotCand.Reason) { 2723 IsTopNode = true; 2724 tracePick(TopCand, IsTopNode); 2725 return TopCand.SU; 2726 } 2727 // Otherwise prefer the bottom candidate, in node order if all else failed. 2728 IsTopNode = false; 2729 tracePick(BotCand, IsTopNode); 2730 return BotCand.SU; 2731 } 2732 2733 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy. 
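/// Depending on RegionPolicy this schedules purely top-down, purely bottom-up,
/// or in both directions via pickNodeBidirectional(); nodes that are already
/// scheduled are skipped and the pick is retried.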
2734 SUnit *GenericScheduler::pickNode(bool &IsTopNode) { 2735 if (DAG->top() == DAG->bottom()) { 2736 assert(Top.Available.empty() && Top.Pending.empty() && 2737 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); 2738 return NULL; 2739 } 2740 SUnit *SU; 2741 do { 2742 if (RegionPolicy.OnlyTopDown) { 2743 SU = Top.pickOnlyChoice(); 2744 if (!SU) { 2745 CandPolicy NoPolicy; 2746 SchedCandidate TopCand(NoPolicy); 2747 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); 2748 assert(TopCand.Reason != NoCand && "failed to find a candidate"); 2749 tracePick(TopCand, true); 2750 SU = TopCand.SU; 2751 } 2752 IsTopNode = true; 2753 } 2754 else if (RegionPolicy.OnlyBottomUp) { 2755 SU = Bot.pickOnlyChoice(); 2756 if (!SU) { 2757 CandPolicy NoPolicy; 2758 SchedCandidate BotCand(NoPolicy); 2759 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); 2760 assert(BotCand.Reason != NoCand && "failed to find a candidate"); 2761 tracePick(BotCand, false); 2762 SU = BotCand.SU; 2763 } 2764 IsTopNode = false; 2765 } 2766 else { 2767 SU = pickNodeBidirectional(IsTopNode); 2768 } 2769 } while (SU->isScheduled); 2770 2771 if (SU->isTopReady()) 2772 Top.removeReady(SU); 2773 if (SU->isBottomReady()) 2774 Bot.removeReady(SU); 2775 2776 DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); 2777 return SU; 2778 } 2779 2780 void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { 2781 2782 MachineBasicBlock::iterator InsertPos = SU->getInstr(); 2783 if (!isTop) 2784 ++InsertPos; 2785 SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs; 2786 2787 // Find already scheduled copies with a single physreg dependence and move 2788 // them just above the scheduled instruction. 2789 for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end(); 2790 I != E; ++I) { 2791 if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg())) 2792 continue; 2793 SUnit *DepSU = I->getSUnit(); 2794 if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) 2795 continue; 2796 MachineInstr *Copy = DepSU->getInstr(); 2797 if (!Copy->isCopy()) 2798 continue; 2799 DEBUG(dbgs() << " Rescheduling physreg copy "; 2800 I->getSUnit()->dump(DAG)); 2801 DAG->moveInstruction(Copy, InsertPos); 2802 } 2803 } 2804 2805 /// Update the scheduler's state after scheduling a node. This is the same node 2806 /// that was just returned by pickNode(). However, ScheduleDAGMI needs to update 2807 /// it's state based on the current cycle before MachineSchedStrategy does. 2808 /// 2809 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling 2810 /// them here. See comments in biasPhysRegCopy. 2811 void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { 2812 if (IsTopNode) { 2813 SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.CurrCycle); 2814 Top.bumpNode(SU); 2815 if (SU->hasPhysRegUses) 2816 reschedulePhysRegCopies(SU, true); 2817 } 2818 else { 2819 SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.CurrCycle); 2820 Bot.bumpNode(SU); 2821 if (SU->hasPhysRegDefs) 2822 reschedulePhysRegCopies(SU, false); 2823 } 2824 } 2825 2826 /// Create the standard converging machine scheduler. This will be used as the 2827 /// default scheduler if the target does not set a default. 2828 static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C) { 2829 ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new GenericScheduler(C)); 2830 // Register DAG post-processors. 
2831 // 2832 // FIXME: extend the mutation API to allow earlier mutations to instantiate 2833 // data and pass it to later mutations. Have a single mutation that gathers 2834 // the interesting nodes in one pass. 2835 DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI)); 2836 if (EnableLoadCluster && DAG->TII->enableClusterLoads()) 2837 DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI)); 2838 if (EnableMacroFusion) 2839 DAG->addMutation(new MacroFusion(DAG->TII)); 2840 return DAG; 2841 } 2842 static MachineSchedRegistry 2843 GenericSchedRegistry("converge", "Standard converging scheduler.", 2844 createGenericSched); 2845 2846 //===----------------------------------------------------------------------===// 2847 // ILP Scheduler. Currently for experimental analysis of heuristics. 2848 //===----------------------------------------------------------------------===// 2849 2850 namespace { 2851 /// \brief Order nodes by the ILP metric. 2852 struct ILPOrder { 2853 const SchedDFSResult *DFSResult; 2854 const BitVector *ScheduledTrees; 2855 bool MaximizeILP; 2856 2857 ILPOrder(bool MaxILP): DFSResult(0), ScheduledTrees(0), MaximizeILP(MaxILP) {} 2858 2859 /// \brief Apply a less-than relation on node priority. 2860 /// 2861 /// (Return true if A comes after B in the Q.) 2862 bool operator()(const SUnit *A, const SUnit *B) const { 2863 unsigned SchedTreeA = DFSResult->getSubtreeID(A); 2864 unsigned SchedTreeB = DFSResult->getSubtreeID(B); 2865 if (SchedTreeA != SchedTreeB) { 2866 // Unscheduled trees have lower priority. 2867 if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB)) 2868 return ScheduledTrees->test(SchedTreeB); 2869 2870 // Trees with shallower connections have have lower priority. 2871 if (DFSResult->getSubtreeLevel(SchedTreeA) 2872 != DFSResult->getSubtreeLevel(SchedTreeB)) { 2873 return DFSResult->getSubtreeLevel(SchedTreeA) 2874 < DFSResult->getSubtreeLevel(SchedTreeB); 2875 } 2876 } 2877 if (MaximizeILP) 2878 return DFSResult->getILP(A) < DFSResult->getILP(B); 2879 else 2880 return DFSResult->getILP(A) > DFSResult->getILP(B); 2881 } 2882 }; 2883 2884 /// \brief Schedule based on the ILP metric. 2885 class ILPScheduler : public MachineSchedStrategy { 2886 ScheduleDAGMI *DAG; 2887 ILPOrder Cmp; 2888 2889 std::vector<SUnit*> ReadyQ; 2890 public: 2891 ILPScheduler(bool MaximizeILP): DAG(0), Cmp(MaximizeILP) {} 2892 2893 virtual void initialize(ScheduleDAGMI *dag) { 2894 DAG = dag; 2895 DAG->computeDFSResult(); 2896 Cmp.DFSResult = DAG->getDFSResult(); 2897 Cmp.ScheduledTrees = &DAG->getScheduledTrees(); 2898 ReadyQ.clear(); 2899 } 2900 2901 virtual void registerRoots() { 2902 // Restore the heap in ReadyQ with the updated DFS results. 2903 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2904 } 2905 2906 /// Implement MachineSchedStrategy interface. 2907 /// ----------------------------------------- 2908 2909 /// Callback to select the highest priority node from the ready Q. 
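  /// ReadyQ is maintained as a heap ordered by ILPOrder; the best node is
  /// moved to the back with pop_heap() and returned. This strategy only
  /// schedules bottom-up, so IsTopNode is always set to false.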
2910 virtual SUnit *pickNode(bool &IsTopNode) { 2911 if (ReadyQ.empty()) return NULL; 2912 std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2913 SUnit *SU = ReadyQ.back(); 2914 ReadyQ.pop_back(); 2915 IsTopNode = false; 2916 DEBUG(dbgs() << "Pick node " << "SU(" << SU->NodeNum << ") " 2917 << " ILP: " << DAG->getDFSResult()->getILP(SU) 2918 << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @" 2919 << DAG->getDFSResult()->getSubtreeLevel( 2920 DAG->getDFSResult()->getSubtreeID(SU)) << '\n' 2921 << "Scheduling " << *SU->getInstr()); 2922 return SU; 2923 } 2924 2925 /// \brief Scheduler callback to notify that a new subtree is scheduled. 2926 virtual void scheduleTree(unsigned SubtreeID) { 2927 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2928 } 2929 2930 /// Callback after a node is scheduled. Mark a newly scheduled tree, notify 2931 /// DFSResults, and resort the priority Q. 2932 virtual void schedNode(SUnit *SU, bool IsTopNode) { 2933 assert(!IsTopNode && "SchedDFSResult needs bottom-up"); 2934 } 2935 2936 virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ } 2937 2938 virtual void releaseBottomNode(SUnit *SU) { 2939 ReadyQ.push_back(SU); 2940 std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2941 } 2942 }; 2943 } // namespace 2944 2945 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { 2946 return new ScheduleDAGMI(C, new ILPScheduler(true)); 2947 } 2948 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { 2949 return new ScheduleDAGMI(C, new ILPScheduler(false)); 2950 } 2951 static MachineSchedRegistry ILPMaxRegistry( 2952 "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler); 2953 static MachineSchedRegistry ILPMinRegistry( 2954 "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler); 2955 2956 //===----------------------------------------------------------------------===// 2957 // Machine Instruction Shuffler for Correctness Testing 2958 //===----------------------------------------------------------------------===// 2959 2960 #ifndef NDEBUG 2961 namespace { 2962 /// Apply a less-than relation on the node order, which corresponds to the 2963 /// instruction order prior to scheduling. IsReverse implements greater-than. 2964 template<bool IsReverse> 2965 struct SUnitOrder { 2966 bool operator()(SUnit *A, SUnit *B) const { 2967 if (IsReverse) 2968 return A->NodeNum > B->NodeNum; 2969 else 2970 return A->NodeNum < B->NodeNum; 2971 } 2972 }; 2973 2974 /// Reorder instructions as much as possible. 2975 class InstructionShuffler : public MachineSchedStrategy { 2976 bool IsAlternating; 2977 bool IsTopDown; 2978 2979 // Using a less-than relation (SUnitOrder<false>) for the TopQ priority 2980 // gives nodes with a higher number higher priority causing the latest 2981 // instructions to be scheduled first. 2982 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> > 2983 TopQ; 2984 // When scheduling bottom-up, use greater-than as the queue priority. 2985 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> > 2986 BottomQ; 2987 public: 2988 InstructionShuffler(bool alternate, bool topdown) 2989 : IsAlternating(alternate), IsTopDown(topdown) {} 2990 2991 virtual void initialize(ScheduleDAGMI *) { 2992 TopQ.clear(); 2993 BottomQ.clear(); 2994 } 2995 2996 /// Implement MachineSchedStrategy interface. 
2997 /// ----------------------------------------- 2998 2999 virtual SUnit *pickNode(bool &IsTopNode) { 3000 SUnit *SU; 3001 if (IsTopDown) { 3002 do { 3003 if (TopQ.empty()) return NULL; 3004 SU = TopQ.top(); 3005 TopQ.pop(); 3006 } while (SU->isScheduled); 3007 IsTopNode = true; 3008 } 3009 else { 3010 do { 3011 if (BottomQ.empty()) return NULL; 3012 SU = BottomQ.top(); 3013 BottomQ.pop(); 3014 } while (SU->isScheduled); 3015 IsTopNode = false; 3016 } 3017 if (IsAlternating) 3018 IsTopDown = !IsTopDown; 3019 return SU; 3020 } 3021 3022 virtual void schedNode(SUnit *SU, bool IsTopNode) {} 3023 3024 virtual void releaseTopNode(SUnit *SU) { 3025 TopQ.push(SU); 3026 } 3027 virtual void releaseBottomNode(SUnit *SU) { 3028 BottomQ.push(SU); 3029 } 3030 }; 3031 } // namespace 3032 3033 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { 3034 bool Alternate = !ForceTopDown && !ForceBottomUp; 3035 bool TopDown = !ForceBottomUp; 3036 assert((TopDown || !ForceTopDown) && 3037 "-misched-topdown incompatible with -misched-bottomup"); 3038 return new ScheduleDAGMI(C, new InstructionShuffler(Alternate, TopDown)); 3039 } 3040 static MachineSchedRegistry ShufflerRegistry( 3041 "shuffle", "Shuffle machine instructions alternating directions", 3042 createInstructionShuffler); 3043 #endif // !NDEBUG 3044 3045 //===----------------------------------------------------------------------===// 3046 // GraphWriter support for ScheduleDAGMI. 3047 //===----------------------------------------------------------------------===// 3048 3049 #ifndef NDEBUG 3050 namespace llvm { 3051 3052 template<> struct GraphTraits< 3053 ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {}; 3054 3055 template<> 3056 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { 3057 3058 DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} 3059 3060 static std::string getGraphName(const ScheduleDAG *G) { 3061 return G->MF.getName(); 3062 } 3063 3064 static bool renderGraphFromBottomUp() { 3065 return true; 3066 } 3067 3068 static bool isNodeHidden(const SUnit *Node) { 3069 return (Node->Preds.size() > 10 || Node->Succs.size() > 10); 3070 } 3071 3072 static bool hasNodeAddressLabel(const SUnit *Node, 3073 const ScheduleDAG *Graph) { 3074 return false; 3075 } 3076 3077 /// If you want to override the dot attributes printed for a particular 3078 /// edge, override this method. 
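  /// Here artificial dependence edges are rendered dashed cyan and other
  /// non-data (control) edges dashed blue; ordinary data edges keep the
  /// default style.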
3079 static std::string getEdgeAttributes(const SUnit *Node, 3080 SUnitIterator EI, 3081 const ScheduleDAG *Graph) { 3082 if (EI.isArtificialDep()) 3083 return "color=cyan,style=dashed"; 3084 if (EI.isCtrlDep()) 3085 return "color=blue,style=dashed"; 3086 return ""; 3087 } 3088 3089 static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) { 3090 std::string Str; 3091 raw_string_ostream SS(Str); 3092 const SchedDFSResult *DFS = 3093 static_cast<const ScheduleDAGMI*>(G)->getDFSResult(); 3094 SS << "SU:" << SU->NodeNum; 3095 if (DFS) 3096 SS << " I:" << DFS->getNumInstrs(SU); 3097 return SS.str(); 3098 } 3099 static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) { 3100 return G->getGraphNodeLabel(SU); 3101 } 3102 3103 static std::string getNodeAttributes(const SUnit *N, 3104 const ScheduleDAG *Graph) { 3105 std::string Str("shape=Mrecord"); 3106 const SchedDFSResult *DFS = 3107 static_cast<const ScheduleDAGMI*>(Graph)->getDFSResult(); 3108 if (DFS) { 3109 Str += ",style=filled,fillcolor=\"#"; 3110 Str += DOT::getColorString(DFS->getSubtreeID(N)); 3111 Str += '"'; 3112 } 3113 return Str; 3114 } 3115 }; 3116 } // namespace llvm 3117 #endif // NDEBUG 3118 3119 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG 3120 /// rendered using 'dot'. 3121 /// 3122 void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) { 3123 #ifndef NDEBUG 3124 ViewGraph(this, Name, false, Title); 3125 #else 3126 errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on " 3127 << "systems with Graphviz or gv!\n"; 3128 #endif // NDEBUG 3129 } 3130 3131 /// Out-of-line implementation with no arguments is handy for gdb. 3132 void ScheduleDAGMI::viewGraph() { 3133 viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName()); 3134 } 3135