//===- MachineScheduler.cpp - Machine Instruction Scheduler --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// MachineScheduler schedules machine instructions after phi elimination. It
// preserves LiveIntervals so it can be invoked before register allocation.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "misched"

#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include <queue>

using namespace llvm;

namespace llvm {
cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
                           cl::desc("Force top-down list scheduling"));
cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
                            cl::desc("Force bottom-up list scheduling"));
}

#ifndef NDEBUG
static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
  cl::desc("Pop up a window to show MISched dags after they are processed"));

static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
  cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
#else
static bool ViewMISchedDAGs = false;
#endif // NDEBUG

static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
  cl::desc("Enable register pressure scheduling."), cl::init(true));

static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
  cl::desc("Enable cyclic critical path analysis."), cl::init(true));

static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
  cl::desc("Enable load clustering."), cl::init(true));

// Experimental heuristics
static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
  cl::desc("Enable scheduling for macro fusion."), cl::init(true));

static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
  cl::desc("Verify machine instrs before and after machine scheduling"));

// DAG subtrees must have at least this many nodes.
static const unsigned MinSubtreeSize = 8;

//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//

MachineSchedContext::MachineSchedContext():
    MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) {
  RegClassInfo = new RegisterClassInfo();
}

MachineSchedContext::~MachineSchedContext() {
  delete RegClassInfo;
}

namespace {
/// MachineScheduler runs after coalescing and before register allocation.
class MachineScheduler : public MachineSchedContext,
                         public MachineFunctionPass {
public:
  MachineScheduler();

  virtual void getAnalysisUsage(AnalysisUsage &AU) const;

  virtual void releaseMemory() {}

  virtual bool runOnMachineFunction(MachineFunction&);

  virtual void print(raw_ostream &O, const Module* = 0) const;

  static char ID; // Class identification, replacement for typeinfo

protected:
  ScheduleDAGInstrs *createMachineScheduler();
};
} // namespace

char MachineScheduler::ID = 0;

char &llvm::MachineSchedulerID = MachineScheduler::ID;

INITIALIZE_PASS_BEGIN(MachineScheduler, "misched",
                      "Machine Instruction Scheduler", false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(MachineScheduler, "misched",
                    "Machine Instruction Scheduler", false, false)

MachineScheduler::MachineScheduler()
: MachineFunctionPass(ID) {
  initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
}

void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addRequiredID(MachineDominatorsID);
  AU.addRequired<MachineLoopInfo>();
  AU.addRequired<AliasAnalysis>();
  AU.addRequired<TargetPassConfig>();
  AU.addRequired<SlotIndexes>();
  AU.addPreserved<SlotIndexes>();
  AU.addRequired<LiveIntervals>();
  AU.addPreserved<LiveIntervals>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachinePassRegistry MachineSchedRegistry::Registry;

/// A dummy default scheduler factory indicates whether the scheduler
/// is overridden on the command line.
static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
  return 0;
}

/// MachineSchedOpt allows command line selection of the scheduler.
static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
               RegisterPassParser<MachineSchedRegistry> >
MachineSchedOpt("misched",
                cl::init(&useDefaultMachineSched), cl::Hidden,
                cl::desc("Machine instruction scheduler to use"));

static MachineSchedRegistry
DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
                     useDefaultMachineSched);

/// Forward declare the standard machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C);


/// Decrement this iterator until reaching the top or a non-debug instr.
static MachineBasicBlock::const_iterator
priorNonDebug(MachineBasicBlock::const_iterator I,
              MachineBasicBlock::const_iterator Beg) {
  assert(I != Beg && "reached the top of the region, cannot decrement");
  while (--I != Beg) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

/// Non-const version.
static MachineBasicBlock::iterator
priorNonDebug(MachineBasicBlock::iterator I,
              MachineBasicBlock::const_iterator Beg) {
  return const_cast<MachineInstr*>(
    &*priorNonDebug(MachineBasicBlock::const_iterator(I), Beg));
}

/// If this iterator is a debug value, increment until reaching the End or a
/// non-debug instruction.
static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,
            MachineBasicBlock::const_iterator End) {
  for(; I != End; ++I) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  // Cast the return value to nonconst MachineInstr, then cast to an
  // instr_iterator, which does not check for null, finally return a
  // bundle_iterator.
  return MachineBasicBlock::instr_iterator(
    const_cast<MachineInstr*>(
      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
}

/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
  // Select the scheduler, or set the default.
  MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
  if (Ctor != useDefaultMachineSched)
    return Ctor(this);

  // Get the default scheduler set by the target for this function.
  ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
  if (Scheduler)
    return Scheduler;

  // Default to GenericScheduler.
  return createGenericSched(this);
}

/// Top-level MachineScheduler pass driver.
///
/// Visit blocks in function order. Divide each block into scheduling regions
/// and visit them bottom-up. Visiting regions bottom-up is not required, but is
/// consistent with the DAG builder, which traverses the interior of the
/// scheduling regions bottom-up.
///
/// This design avoids exposing scheduling boundaries to the DAG builder,
/// simplifying the DAG builder's support for "special" target instructions.
/// At the same time the design allows target schedulers to operate across
/// scheduling boundaries, for example to bundle the boundary instructions
/// without reordering them. This creates complexity, because the target
/// scheduler must update the RegionBegin and RegionEnd positions cached by
/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
/// design would be to split blocks at scheduling boundaries, but LLVM has a
/// general bias against block splitting purely for implementation simplicity.
bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
  DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));

  // Initialize the context of the pass.
  MF = &mf;
  MLI = &getAnalysis<MachineLoopInfo>();
  MDT = &getAnalysis<MachineDominatorTree>();
  PassConfig = &getAnalysis<TargetPassConfig>();
  AA = &getAnalysis<AliasAnalysis>();

  LIS = &getAnalysis<LiveIntervals>();
  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();

  if (VerifyScheduling) {
    DEBUG(LIS->dump());
    MF->verify(this, "Before machine scheduling.");
  }
  RegClassInfo->runOnMachineFunction(*MF);

  // Instantiate the selected scheduler for this target, function, and
  // optimization level.
  OwningPtr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());

  // Visit all machine basic blocks.
  //
  // TODO: Visit blocks in global postorder or postorder within the bottom-up
  // loop tree. Then we can optionally compute global RegPressure.
  for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
       MBB != MBBEnd; ++MBB) {

    Scheduler->startBlock(MBB);

    // Break the block into scheduling regions [I, RegionEnd), and schedule each
    // region as soon as it is discovered. RegionEnd points to the scheduling
    // boundary at the bottom of the region. The DAG does not include RegionEnd,
    // but the region does (i.e. the next RegionEnd is above the previous
    // RegionBegin). If the current block has no terminator then RegionEnd ==
    // MBB->end() for the bottom region.
    //
    // The Scheduler may insert instructions during either schedule() or
    // exitRegion(), even for empty regions. So the local iterators 'I' and
    // 'RegionEnd' are invalid across these calls.
    unsigned RemainingInstrs = MBB->size();
    for(MachineBasicBlock::iterator RegionEnd = MBB->end();
        RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {

      // Avoid decrementing RegionEnd for blocks with no terminator.
      if (RegionEnd != MBB->end()
          || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
        --RegionEnd;
        // Count the boundary instruction.
        --RemainingInstrs;
      }

      // The next region starts above the previous region. Look backward in the
      // instruction stream until we find the nearest boundary.
      unsigned NumRegionInstrs = 0;
      MachineBasicBlock::iterator I = RegionEnd;
      for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
        if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
          break;
      }
      // Notify the scheduler of the region, even if we may skip scheduling
      // it. Perhaps it still needs to be bundled.
      Scheduler->enterRegion(MBB, I, RegionEnd, NumRegionInstrs);

      // Skip empty scheduling regions (0 or 1 schedulable instructions).
      if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
        // Close the current region. Bundle the terminator if needed.
        // This invalidates 'RegionEnd' and 'I'.
        Scheduler->exitRegion();
        continue;
      }
      DEBUG(dbgs() << "********** MI Scheduling **********\n");
      DEBUG(dbgs() << MF->getName()
            << ":BB#" << MBB->getNumber() << " " << MBB->getName()
            << "\n  From: " << *I << "    To: ";
            if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
            else dbgs() << "End";
            dbgs() << " RegionInstrs: " << NumRegionInstrs
            << " Remaining: " << RemainingInstrs << "\n");

      // Schedule a region: possibly reorder instructions.
      // This invalidates 'RegionEnd' and 'I'.
      Scheduler->schedule();

      // Close the current region.
      Scheduler->exitRegion();

      // Scheduling has invalidated the current iterator 'I'. Ask the
      // scheduler for the top of its scheduled region.
      RegionEnd = Scheduler->begin();
    }
    assert(RemainingInstrs == 0 && "Instruction count mismatch!");
    Scheduler->finishBlock();
  }
  Scheduler->finalizeSchedule();
  DEBUG(LIS->dump());
  if (VerifyScheduling)
    MF->verify(this, "After machine scheduling.");
  return true;
}

void MachineScheduler::print(raw_ostream &O, const Module* m) const {
  // unimplemented
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ReadyQueue::dump() {
  dbgs() << Name << ": ";
  for (unsigned i = 0, e = Queue.size(); i < e; ++i)
    dbgs() << Queue[i]->NodeNum << " ";
  dbgs() << "\n";
}
#endif

//===----------------------------------------------------------------------===//
// ScheduleDAGMI - Base class for MachineInstr scheduling with LiveIntervals
// preservation.
//===----------------------------------------------------------------------===//

ScheduleDAGMI::~ScheduleDAGMI() {
  delete DFSResult;
  DeleteContainerPointers(Mutations);
  delete SchedImpl;
}

bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
  return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
}

bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
  if (SuccSU != &ExitSU) {
    // Do not use WillCreateCycle, it assumes SD scheduling.
    // If Pred is reachable from Succ, then the edge creates a cycle.
    if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
      return false;
    Topo.AddPred(SuccSU, PredDep.getSUnit());
  }
  SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
  // Return true regardless of whether a new edge needed to be inserted.
  return true;
}

/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
/// NumPredsLeft reaches zero, release the successor node.
///
/// FIXME: Adjust SuccSU height based on MinLatency.
void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    if (SuccEdge->isCluster())
      NextClusterSucc = SuccSU;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    SuccSU->dump(this);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(0);
  }
#endif
  --SuccSU->NumPredsLeft;
  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
    SchedImpl->releaseTopNode(SuccSU);
}

/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
       I != E; ++I) {
    releaseSucc(SU, &*I);
  }
}

/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
/// NumSuccsLeft reaches zero, release the predecessor node.
///
/// FIXME: Adjust PredSU height based on MinLatency.
void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
  SUnit *PredSU = PredEdge->getSUnit();

  if (PredEdge->isWeak()) {
    --PredSU->WeakSuccsLeft;
    if (PredEdge->isCluster())
      NextClusterPred = PredSU;
    return;
  }
#ifndef NDEBUG
  if (PredSU->NumSuccsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    PredSU->dump(this);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(0);
  }
#endif
  --PredSU->NumSuccsLeft;
  if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
    SchedImpl->releaseBottomNode(PredSU);
}

/// releasePredecessors - Call releasePred on each of SU's predecessors.
void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
       I != E; ++I) {
    releasePred(SU, &*I);
  }
}

/// This is normally called from the main scheduler loop but may also be invoked
/// by the scheduling strategy to perform additional code motion.
void ScheduleDAGMI::moveInstruction(MachineInstr *MI,
                                    MachineBasicBlock::iterator InsertPos) {
  // Advance RegionBegin if the first instruction moves down.
  if (&*RegionBegin == MI)
    ++RegionBegin;

  // Update the instruction stream.
  BB->splice(InsertPos, BB, MI);

  // Update LiveIntervals
  LIS->handleMove(MI, /*UpdateFlags=*/true);

  // Recede RegionBegin if an instruction moves above the first.
  if (RegionBegin == InsertPos)
    RegionBegin = MI;
}

bool ScheduleDAGMI::checkSchedLimit() {
#ifndef NDEBUG
  if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
    CurrentTop = CurrentBottom;
    return false;
  }
  ++NumInstrsScheduled;
#endif
  return true;
}

/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
/// crossing a scheduling boundary. [begin, end) includes all instructions in
/// the region, including the boundary itself and single-instruction regions
/// that don't get scheduled.
void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
                                MachineBasicBlock::iterator begin,
                                MachineBasicBlock::iterator end,
                                unsigned regioninstrs)
{
  ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);

  // For convenience remember the end of the liveness region.
  LiveRegionEnd =
    (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);

  SUPressureDiffs.clear();

  SchedImpl->initPolicy(begin, end, regioninstrs);

  ShouldTrackPressure = SchedImpl->shouldTrackPressure();
}

// Set up the register pressure trackers for the top and bottom scheduled
// regions.
void ScheduleDAGMI::initRegPressure() {
  TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
  BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);

  // Close the RPTracker to finalize live ins.
  RPTracker.closeRegion();

  DEBUG(RPTracker.dump());

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Close one end of the tracker so we can call
  // getMaxUpward/DownwardPressureDelta before advancing across any
  // instructions. This converts currently live regs into live ins/outs.
  TopRPTracker.closeTop();
  BotRPTracker.closeBottom();

  BotRPTracker.initLiveThru(RPTracker);
  if (!BotRPTracker.getLiveThru().empty()) {
    TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
    DEBUG(dbgs() << "Live Thru: ";
          dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
  }

  // For each live out vreg reduce the pressure change associated with other
  // uses of the same vreg below the live-out reaching def.
  updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);

  // Account for liveness generated by the region boundary.
  if (LiveRegionEnd != RegionEnd) {
    SmallVector<unsigned, 8> LiveUses;
    BotRPTracker.recede(&LiveUses);
    updatePressureDiffs(LiveUses);
  }

  assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");

  // Cache the list of excess pressure sets in this region. This will also track
  // the max pressure in the scheduled code for these sets.
  RegionCriticalPSets.clear();
  const std::vector<unsigned> &RegionPressure =
    RPTracker.getPressure().MaxSetPressure;
  for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
    unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
    if (RegionPressure[i] > Limit) {
      DEBUG(dbgs() << TRI->getRegPressureSetName(i)
            << " Limit " << Limit
            << " Actual " << RegionPressure[i] << "\n");
      RegionCriticalPSets.push_back(PressureChange(i));
    }
  }
  DEBUG(dbgs() << "Excess PSets: ";
        for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
          dbgs() << TRI->getRegPressureSetName(
            RegionCriticalPSets[i].getPSet()) << " ";
        dbgs() << "\n");
}

void ScheduleDAGMI::
updateScheduledPressure(const SUnit *SU,
                        const std::vector<unsigned> &NewMaxPressure) {
  const PressureDiff &PDiff = getPressureDiff(SU);
  unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
  for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end();
       I != E; ++I) {
    if (!I->isValid())
      break;
    unsigned ID = I->getPSet();
    while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
      ++CritIdx;
    if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
      if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
          && NewMaxPressure[ID] <= INT16_MAX)
        RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
    }
    unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
    if (NewMaxPressure[ID] >= Limit - 2) {
      DEBUG(dbgs() << "  " << TRI->getRegPressureSetName(ID) << ": "
            << NewMaxPressure[ID] << " > " << Limit << "(+ "
            << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
    }
  }
}

/// Update the PressureDiff array for liveness after scheduling this
/// instruction.
void ScheduleDAGMI::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
  for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) {
    /// FIXME: Currently assuming single-use physregs.
    unsigned Reg = LiveUses[LUIdx];
    DEBUG(dbgs() << "  LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");
    if (!TRI->isVirtualRegister(Reg))
      continue;

    // This may be called before CurrentBottom has been initialized. However,
    // BotRPTracker must have a valid position. We want the value live into the
    // instruction or live out of the block, so ask for the previous
    // instruction's live-out.
    const LiveInterval &LI = LIS->getInterval(Reg);
    VNInfo *VNI;
    MachineBasicBlock::const_iterator I =
      nextIfDebug(BotRPTracker.getPos(), BB->end());
    if (I == BB->end())
      VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
    else {
      LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(I));
      VNI = LRQ.valueIn();
    }
    // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
    assert(VNI && "No live value at use.");
    for (VReg2UseMap::iterator
           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
      SUnit *SU = UI->SU;
      DEBUG(dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") "
            << *SU->getInstr());
      // If this use comes before the reaching def, it cannot be a last use, so
      // decrease its pressure change.
      if (!SU->isScheduled && SU != &ExitSU) {
        LiveQueryResult LRQ
          = LI.Query(LIS->getInstructionIndex(SU->getInstr()));
        if (LRQ.valueIn() == VNI)
          getPressureDiff(SU).addPressureChange(Reg, true, &MRI);
      }
    }
  }
}

/// schedule - Called back from MachineScheduler::runOnMachineFunction
/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
/// only includes instructions that have DAG nodes, not scheduling boundaries.
///
/// This is a skeletal driver, with all the functionality pushed into helpers,
/// so that it can be easily extended by experimental schedulers. Generally,
/// implementing MachineSchedStrategy should be sufficient to implement a new
/// scheduling algorithm. However, if a scheduler further subclasses
/// ScheduleDAGMI then it will want to override this virtual method in order to
/// update any specialized state.
void ScheduleDAGMI::schedule() {
  buildDAGWithRegPressure();

  Topo.InitDAGTopologicalSorting();

  postprocessDAG();

  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  findRootsAndBiasEdges(TopRoots, BotRoots);

  // Initialize the strategy before modifying the DAG.
  // This may initialize a DFSResult to be used for queue priority.
  SchedImpl->initialize(this);

  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
          SUnits[su].dumpAll(this));
  if (ViewMISchedDAGs) viewGraph();

  // Initialize ready queues now that the DAG and priority data are finalized.
  initQueues(TopRoots, BotRoots);

  bool IsTopNode = false;
  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
    assert(!SU->isScheduled && "Node already scheduled");
    if (!checkSchedLimit())
      break;

    scheduleMI(SU, IsTopNode);

    updateQueues(SU, IsTopNode);
  }
  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
      unsigned BBNum = begin()->getParent()->getNumber();
      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
      dumpSchedule();
      dbgs() << '\n';
    });
}

/// Build the DAG and setup three register pressure trackers.
void ScheduleDAGMI::buildDAGWithRegPressure() {
  if (!ShouldTrackPressure) {
    RPTracker.reset();
    RegionCriticalPSets.clear();
    buildSchedGraph(AA);
    return;
  }

  // Initialize the register pressure tracker used by buildSchedGraph.
  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
                 /*TrackUntiedDefs=*/true);

  // Account for liveness generated by the region boundary.
  if (LiveRegionEnd != RegionEnd)
    RPTracker.recede();

  // Build the DAG, and compute current register pressure.
  buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);

  // Initialize top/bottom trackers after computing region pressure.
  initRegPressure();
}

/// Apply each ScheduleDAGMutation step in order.
void ScheduleDAGMI::postprocessDAG() {
  for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
    Mutations[i]->apply(this);
  }
}

void ScheduleDAGMI::computeDFSResult() {
  if (!DFSResult)
    DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
  DFSResult->clear();
  ScheduledTrees.clear();
  DFSResult->resize(SUnits.size());
  DFSResult->compute(SUnits);
  ScheduledTrees.resize(DFSResult->getNumSubtrees());
}

void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
                                          SmallVectorImpl<SUnit*> &BotRoots) {
  for (std::vector<SUnit>::iterator
         I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
    SUnit *SU = &(*I);
    assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits");

    // Order predecessors so DFSResult follows the critical path.
    SU->biasCriticalPath();

    // A SUnit is ready to top schedule if it has no predecessors.
    if (!I->NumPredsLeft)
      TopRoots.push_back(SU);
    // A SUnit is ready to bottom schedule if it has no successors.
    if (!I->NumSuccsLeft)
      BotRoots.push_back(SU);
  }
  ExitSU.biasCriticalPath();
}

/// Compute the max cyclic critical path through the DAG. The scheduling DAG
/// only provides the critical path for single-block loops. To handle loops that
/// span blocks, we could use the vreg path latencies provided by
/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
/// available for use in the scheduler.
///
/// The cyclic path estimation identifies a def-use pair that crosses the back
/// edge and considers the depth and height of the nodes. For example, consider
/// the following instruction sequence where each instruction has unit latency
/// and defines an eponymous virtual register:
///
/// a->b(a,c)->c(b)->d(c)->exit
///
/// The cyclic critical path is two cycles: b->c->b
/// The acyclic critical path is four cycles: a->b->c->d->exit
/// LiveOutHeight = height(c) = len(c->d->exit) = 2
/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
/// LiveInDepth = depth(b) = len(a->b) = 1
///
/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
/// CyclicCriticalPath = min(2, 2) = 2
unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
  // This only applies to single-block loops.
  if (!BB->isSuccessor(BB))
    return 0;

  unsigned MaxCyclicLatency = 0;
  // Visit each live out vreg def to find def/use pairs that cross iterations.
  ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
  for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
       RI != RE; ++RI) {
    unsigned Reg = *RI;
    if (!TRI->isVirtualRegister(Reg))
      continue;
    const LiveInterval &LI = LIS->getInterval(Reg);
    const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
    if (!DefVNI)
      continue;

    MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
    const SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;

    unsigned LiveOutHeight = DefSU->getHeight();
    unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
    // Visit all local users of the vreg def.
    for (VReg2UseMap::iterator
           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
      if (UI->SU == &ExitSU)
        continue;

      // Only consider uses of the phi.
      LiveQueryResult LRQ =
        LI.Query(LIS->getInstructionIndex(UI->SU->getInstr()));
      if (!LRQ.valueIn()->isPHIDef())
        continue;

      // Assume that a path spanning two iterations is a cycle, which could
      // overestimate in strange cases. This allows cyclic latency to be
      // estimated as the minimum slack of the vreg's depth or height.
      unsigned CyclicLatency = 0;
      if (LiveOutDepth > UI->SU->getDepth())
        CyclicLatency = LiveOutDepth - UI->SU->getDepth();

      unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
      if (LiveInHeight > LiveOutHeight) {
        if (LiveInHeight - LiveOutHeight < CyclicLatency)
          CyclicLatency = LiveInHeight - LiveOutHeight;
      }
      else
        CyclicLatency = 0;

      DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
            << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
      if (CyclicLatency > MaxCyclicLatency)
        MaxCyclicLatency = CyclicLatency;
    }
  }
  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
  return MaxCyclicLatency;
}

/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
                               ArrayRef<SUnit*> BotRoots) {
  NextClusterSucc = NULL;
  NextClusterPred = NULL;

  // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
  //
  // Nodes with unreleased weak edges can still be roots.
  // Release top roots in forward order.
  for (SmallVectorImpl<SUnit*>::const_iterator
         I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) {
    SchedImpl->releaseTopNode(*I);
  }
  // Release bottom roots in reverse order so the higher priority nodes appear
  // first. This is more natural and slightly more efficient.
  for (SmallVectorImpl<SUnit*>::const_reverse_iterator
         I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
    SchedImpl->releaseBottomNode(*I);
  }

  releaseSuccessors(&EntrySU);
  releasePredecessors(&ExitSU);

  SchedImpl->registerRoots();

  // Advance past initial DebugValues.
  CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
  CurrentBottom = RegionEnd;

  if (ShouldTrackPressure) {
    assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
    TopRPTracker.setPos(CurrentTop);
  }
}

/// Move an instruction and update register pressure.
void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
  // Move the instruction to its new location in the instruction stream.
  MachineInstr *MI = SU->getInstr();

  if (IsTopNode) {
    assert(SU->isTopReady() && "node still has unscheduled dependencies");
    if (&*CurrentTop == MI)
      CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
    else {
      moveInstruction(MI, CurrentTop);
      TopRPTracker.setPos(MI);
    }

    if (ShouldTrackPressure) {
      // Update top scheduled pressure.
      TopRPTracker.advance();
      assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
      updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
    }
  }
  else {
    assert(SU->isBottomReady() && "node still has unscheduled dependencies");
    MachineBasicBlock::iterator priorII =
      priorNonDebug(CurrentBottom, CurrentTop);
    if (&*priorII == MI)
      CurrentBottom = priorII;
    else {
      if (&*CurrentTop == MI) {
        CurrentTop = nextIfDebug(++CurrentTop, priorII);
        TopRPTracker.setPos(CurrentTop);
      }
      moveInstruction(MI, CurrentBottom);
      CurrentBottom = MI;
    }
    if (ShouldTrackPressure) {
      // Update bottom scheduled pressure.
      SmallVector<unsigned, 8> LiveUses;
      BotRPTracker.recede(&LiveUses);
      assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
      updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
      updatePressureDiffs(LiveUses);
    }
  }
}

/// Update scheduler queues after scheduling an instruction.
void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
  // Release dependent instructions for scheduling.
  if (IsTopNode)
    releaseSuccessors(SU);
  else
    releasePredecessors(SU);

  SU->isScheduled = true;

  if (DFSResult) {
    unsigned SubtreeID = DFSResult->getSubtreeID(SU);
    if (!ScheduledTrees.test(SubtreeID)) {
      ScheduledTrees.set(SubtreeID);
      DFSResult->scheduleTree(SubtreeID);
      SchedImpl->scheduleTree(SubtreeID);
    }
  }

  // Notify the scheduling strategy after updating the DAG.
  SchedImpl->schedNode(SU, IsTopNode);
}

/// Reinsert any remaining debug_values, just like the PostRA scheduler.
void ScheduleDAGMI::placeDebugValues() {
  // If first instruction was a DBG_VALUE then put it back.
  if (FirstDbgValue) {
    BB->splice(RegionBegin, BB, FirstDbgValue);
    RegionBegin = FirstDbgValue;
  }

  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
    MachineInstr *DbgValue = P.first;
    MachineBasicBlock::iterator OrigPrevMI = P.second;
    if (&*RegionBegin == DbgValue)
      ++RegionBegin;
    BB->splice(++OrigPrevMI, BB, DbgValue);
    if (OrigPrevMI == llvm::prior(RegionEnd))
      RegionEnd = DbgValue;
  }
  DbgValues.clear();
  FirstDbgValue = NULL;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ScheduleDAGMI::dumpSchedule() const {
  for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
    if (SUnit *SU = getSUnit(&(*MI)))
      SU->dump(this);
    else
      dbgs() << "Missing SUnit\n";
  }
}
#endif

//===----------------------------------------------------------------------===//
// LoadClusterMutation - DAG post-processing to cluster loads.
//===----------------------------------------------------------------------===//

namespace {
/// \brief Post-process the DAG to create cluster edges between neighboring
/// loads.
class LoadClusterMutation : public ScheduleDAGMutation {
  struct LoadInfo {
    SUnit *SU;
    unsigned BaseReg;
    unsigned Offset;
    LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
      : SU(su), BaseReg(reg), Offset(ofs) {}
  };
  static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
                           const LoadClusterMutation::LoadInfo &RHS);

  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
public:
  LoadClusterMutation(const TargetInstrInfo *tii,
                      const TargetRegisterInfo *tri)
    : TII(tii), TRI(tri) {}

  virtual void apply(ScheduleDAGMI *DAG);
protected:
  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
};
} // anonymous

bool LoadClusterMutation::LoadInfoLess(
  const LoadClusterMutation::LoadInfo &LHS,
  const LoadClusterMutation::LoadInfo &RHS) {
  if (LHS.BaseReg != RHS.BaseReg)
    return LHS.BaseReg < RHS.BaseReg;
  return LHS.Offset < RHS.Offset;
}

void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
                                                  ScheduleDAGMI *DAG) {
  SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
    SUnit *SU = Loads[Idx];
    unsigned BaseReg;
    unsigned Offset;
    if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
  }
  if (LoadRecords.size() < 2)
    return;
  std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
  unsigned ClusterLength = 1;
  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
      ClusterLength = 1;
      continue;
    }

    SUnit *SUa = LoadRecords[Idx].SU;
    SUnit *SUb = LoadRecords[Idx+1].SU;
    if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
        && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {

      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
            << SUb->NodeNum << ")\n");
      // Copy successor edges from SUa to SUb. Interleaving computation
      // dependent on SUa can prevent load combining due to register reuse.
      // Predecessor edges do not need to be copied from SUb to SUa since nearby
      // loads should have effectively the same inputs.
      for (SUnit::const_succ_iterator
             SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
        if (SI->getSUnit() == SUb)
          continue;
        DEBUG(dbgs() << "  Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
        DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
      }
      ++ClusterLength;
    }
    else
      ClusterLength = 1;
  }
}

/// \brief Callback from DAG postProcessing to create cluster edges for loads.
void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
  // Map DAG NodeNum to store chain ID.
  DenseMap<unsigned, unsigned> StoreChainIDs;
  // Map each store chain to a set of dependent loads.
  SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
    SUnit *SU = &DAG->SUnits[Idx];
    if (!SU->getInstr()->mayLoad())
      continue;
    unsigned ChainPredID = DAG->SUnits.size();
    for (SUnit::const_pred_iterator
           PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
      if (PI->isCtrl()) {
        ChainPredID = PI->getSUnit()->NodeNum;
        break;
      }
    }
    // Check if this chain-like pred has been seen
    // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
    unsigned NumChains = StoreChainDependents.size();
    std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
      StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
    if (Result.second)
      StoreChainDependents.resize(NumChains + 1);
    StoreChainDependents[Result.first->second].push_back(SU);
  }
  // Iterate over the store chains.
  for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
}

//===----------------------------------------------------------------------===//
// MacroFusion - DAG post-processing to encourage fusion of macro ops.
//===----------------------------------------------------------------------===//

namespace {
/// \brief Post-process the DAG to create cluster edges between instructions
/// that may be fused by the processor into a single operation.
class MacroFusion : public ScheduleDAGMutation {
  const TargetInstrInfo *TII;
public:
  MacroFusion(const TargetInstrInfo *tii): TII(tii) {}

  virtual void apply(ScheduleDAGMI *DAG);
};
} // anonymous

/// \brief Callback from DAG postProcessing to create cluster edges to encourage
/// fused operations.
void MacroFusion::apply(ScheduleDAGMI *DAG) {
  // For now, assume targets can only fuse with the branch.
  MachineInstr *Branch = DAG->ExitSU.getInstr();
  if (!Branch)
    return;

  for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) {
    SUnit *SU = &DAG->SUnits[--Idx];
    if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch))
      continue;

    // Create a single weak edge from SU to ExitSU. The only effect is to cause
    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
    // need to copy predecessor edges from ExitSU to SU, since top-down
    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
    // of SU, we could create an artificial edge from the deepest root, but it
    // hasn't been needed yet.
    bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster));
    (void)Success;
    assert(Success && "No DAG nodes should be reachable from ExitSU");

    DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n");
    break;
  }
}

//===----------------------------------------------------------------------===//
// CopyConstrain - DAG post-processing to encourage copy elimination.
//===----------------------------------------------------------------------===//

namespace {
/// \brief Post-process the DAG to create weak edges from all uses of a copy to
/// the one use that defines the copy's source vreg, most likely an induction
/// variable increment.
class CopyConstrain : public ScheduleDAGMutation {
  // Transient state.
  SlotIndex RegionBeginIdx;
  // RegionEndIdx is the slot index of the last non-debug instruction in the
  // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
  SlotIndex RegionEndIdx;
public:
  CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}

  virtual void apply(ScheduleDAGMI *DAG);

protected:
  void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG);
};
} // anonymous

/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
///  I0:     = dst
///  I1: src = ...
///  I2:     = dst
///  I3: dst = src (copy)
/// (create pred->succ edges I0->I1, I2->I1)
///
/// 2) Local copy:
///  I0: dst = src (copy)
///  I1:     = dst
///  I2: src = ...
///  I3:     = dst
/// (create pred->succ edges I1->I2, I3->I2)
///
/// Although the MachineScheduler is currently constrained to single blocks,
/// this algorithm should handle extended blocks. An EBB is a set of
/// contiguously numbered blocks such that the previous block in the EBB is
/// always the single predecessor.
void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) {
  LiveIntervals *LIS = DAG->getLIS();
  MachineInstr *Copy = CopySU->getInstr();

  // Check for pure vreg copies.
  unsigned SrcReg = Copy->getOperand(1).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
    return;

  unsigned DstReg = Copy->getOperand(0).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
    return;

  // Check if either the dest or source is local. If it's live across a back
  // edge, it's not local. Note that if both vregs are live across the back
  // edge, we cannot successfully constrain the copy without cyclic scheduling.
  unsigned LocalReg = DstReg;
  unsigned GlobalReg = SrcReg;
  LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
  if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
    LocalReg = SrcReg;
    GlobalReg = DstReg;
    LocalLI = &LIS->getInterval(LocalReg);
    if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
      return;
  }
  LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);

  // Find the global segment after the start of the local LI.
  LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
  // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
  // local live range. We could create edges from other global uses to the local
  // start, but the coalescer should have already eliminated these cases, so
  // don't bother dealing with them.
  if (GlobalSegment == GlobalLI->end())
    return;

  // If GlobalSegment is killed at the LocalLI->start, the call to find()
  // returned the next global segment. But if GlobalSegment overlaps with
  // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
  // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
  if (GlobalSegment->contains(LocalLI->beginIndex()))
    ++GlobalSegment;

  if (GlobalSegment == GlobalLI->end())
    return;

  // Check if GlobalLI contains a hole in the vicinity of LocalLI.
  if (GlobalSegment != GlobalLI->begin()) {
    // Two address defs have no hole.
    if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end,
                               GlobalSegment->start)) {
      return;
    }
    // If the prior global segment may be defined by the same two-address
    // instruction that also defines LocalLI, then we cannot make a hole here.
    if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->start,
                               LocalLI->beginIndex())) {
      return;
    }
    // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
    // it would be a disconnected component in the live range.
    assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() &&
           "Disconnected LRG within the scheduling region.");
  }
  MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
  if (!GlobalDef)
    return;

  SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
  if (!GlobalSU)
    return;

  // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
  // constraining the uses of the last local def to precede GlobalDef.
  SmallVector<SUnit*,8> LocalUses;
  const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
  MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
  SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
  for (SUnit::const_succ_iterator
         I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end();
       I != E; ++I) {
    if (I->getKind() != SDep::Data || I->getReg() != LocalReg)
      continue;
    if (I->getSUnit() == GlobalSU)
      continue;
    if (!DAG->canAddEdge(GlobalSU, I->getSUnit()))
      return;
    LocalUses.push_back(I->getSUnit());
  }
  // Open the top of the GlobalLI hole by constraining any earlier global uses
  // to precede the start of LocalLI.
  SmallVector<SUnit*,8> GlobalUses;
  MachineInstr *FirstLocalDef =
    LIS->getInstructionFromIndex(LocalLI->beginIndex());
  SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
  for (SUnit::const_pred_iterator
         I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) {
    if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg)
      continue;
    if (I->getSUnit() == FirstLocalSU)
      continue;
    if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit()))
      return;
    GlobalUses.push_back(I->getSUnit());
  }
  DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
  // Add the weak edges.
  for (SmallVectorImpl<SUnit*>::const_iterator
         I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
    DEBUG(dbgs() << "  Local use SU(" << (*I)->NodeNum << ") -> SU("
          << GlobalSU->NodeNum << ")\n");
    DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
  }
  for (SmallVectorImpl<SUnit*>::const_iterator
         I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
    DEBUG(dbgs() << "  Global use SU(" << (*I)->NodeNum << ") -> SU("
          << FirstLocalSU->NodeNum << ")\n");
    DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
  }
}

/// \brief Callback from DAG postProcessing to create weak edges to encourage
/// copy elimination.
void CopyConstrain::apply(ScheduleDAGMI *DAG) {
  MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
  if (FirstPos == DAG->end())
    return;
  RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos);
  RegionEndIdx = DAG->getLIS()->getInstructionIndex(
    &*priorNonDebug(DAG->end(), DAG->begin()));

  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
    SUnit *SU = &DAG->SUnits[Idx];
    if (!SU->getInstr()->isCopy())
      continue;

    constrainLocalCopy(SU, DAG);
  }
}

//===----------------------------------------------------------------------===//
// GenericScheduler - Implementation of the generic MachineSchedStrategy.
//===----------------------------------------------------------------------===//

namespace {
/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
/// the schedule.
class GenericScheduler : public MachineSchedStrategy {
public:
  /// Represent the type of SchedCandidate found within a single queue.
  /// pickNodeBidirectional depends on these listed by decreasing priority.
  enum CandReason {
    NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
    ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
    TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};

#ifndef NDEBUG
  static const char *getReasonStr(GenericScheduler::CandReason Reason);
#endif

  /// Policy for scheduling the next instruction in the candidate's zone.
  struct CandPolicy {
    bool ReduceLatency;
    unsigned ReduceResIdx;
    unsigned DemandResIdx;

    CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
  };

  /// Status of an instruction's critical resource consumption.
  struct SchedResourceDelta {
    // Count critical resources in the scheduled region required by SU.
    unsigned CritResources;

    // Count critical resources from another region consumed by SU.
    unsigned DemandedResources;

    SchedResourceDelta(): CritResources(0), DemandedResources(0) {}

    bool operator==(const SchedResourceDelta &RHS) const {
      return CritResources == RHS.CritResources
        && DemandedResources == RHS.DemandedResources;
    }
    bool operator!=(const SchedResourceDelta &RHS) const {
      return !operator==(RHS);
    }
  };

  /// Store the state used by GenericScheduler heuristics, required for the
  /// lifetime of one invocation of pickNode().
  struct SchedCandidate {
    CandPolicy Policy;

    // The best SUnit candidate.
    SUnit *SU;

    // The reason for this candidate.
    CandReason Reason;

    // Set of reasons that apply to multiple candidates.
    uint32_t RepeatReasonSet;

    // Register pressure values for the best candidate.
    RegPressureDelta RPDelta;

    // Critical resource consumption of the best candidate.
    SchedResourceDelta ResDelta;

    SchedCandidate(const CandPolicy &policy)
      : Policy(policy), SU(NULL), Reason(NoCand), RepeatReasonSet(0) {}

    bool isValid() const { return SU; }

    // Copy the status of another candidate without changing policy.
    void setBest(SchedCandidate &Best) {
      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
      SU = Best.SU;
      Reason = Best.Reason;
      RPDelta = Best.RPDelta;
      ResDelta = Best.ResDelta;
    }

    bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); }
    void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); }

    void initResourceDelta(const ScheduleDAGMI *DAG,
                           const TargetSchedModel *SchedModel);
  };

  /// Summarize the unscheduled region.
  struct SchedRemainder {
    // Critical path through the DAG in expected latency.
    unsigned CriticalPath;
    unsigned CyclicCritPath;

    // Scaled count of micro-ops left to schedule.
    unsigned RemIssueCount;

    bool IsAcyclicLatencyLimited;

    // Unscheduled resources
    SmallVector<unsigned, 16> RemainingCounts;

    void reset() {
      CriticalPath = 0;
      CyclicCritPath = 0;
      RemIssueCount = 0;
      IsAcyclicLatencyLimited = false;
      RemainingCounts.clear();
    }

    SchedRemainder() { reset(); }

    void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
  };

  /// Each scheduling boundary is associated with ready queues. It tracks the
  /// current cycle in the direction of movement, and maintains the state
  /// of "hazards" and other interlocks at the current cycle.
  struct SchedBoundary {
    ScheduleDAGMI *DAG;
    const TargetSchedModel *SchedModel;
    SchedRemainder *Rem;

    ReadyQueue Available;
    ReadyQueue Pending;
    bool CheckPending;

    // For heuristics, keep a list of the nodes that immediately depend on the
    // most recently scheduled node.
    SmallPtrSet<const SUnit*, 8> NextSUs;

    ScheduleHazardRecognizer *HazardRec;

    /// Number of cycles it takes to issue the instructions scheduled in this
    /// zone. It is defined as: scheduled-micro-ops / issue-width + stalls.
    /// See getStalls().
    unsigned CurrCycle;

    /// Micro-ops issued in the current cycle
    unsigned CurrMOps;

    /// MinReadyCycle - Cycle of the soonest available instruction.
    unsigned MinReadyCycle;

    // The expected latency of the critical path in this scheduled zone.
    unsigned ExpectedLatency;

    // The latency of dependence chains leading into this zone.
    // For each node scheduled bottom-up: DLat = max DLat, N.Depth.
    // For each cycle scheduled: DLat -= 1.
    unsigned DependentLatency;

    /// Count the scheduled (issued) micro-ops that can be retired by
    /// time=CurrCycle assuming the first scheduled instr is retired at time=0.
    unsigned RetiredMOps;

    // Count scheduled resources that have been executed. Resources are
    // considered executed if they become ready in the time that it takes to
    // saturate any resource including the one in question. Counts are scaled
    // for direct comparison with other resources. Counts can be compared with
    // MOps * getMicroOpFactor and Latency * getLatencyFactor.
    SmallVector<unsigned, 16> ExecutedResCounts;

    /// Cache the max count for a single resource.
    unsigned MaxExecutedResCount;

    // Cache the critical resources ID in this scheduled zone.
    unsigned ZoneCritResIdx;

    // Is the scheduled region resource limited vs. latency limited.
    bool IsResourceLimited;

#ifndef NDEBUG
    // Remember the greatest operand latency as an upper bound on the number of
    // times we should retry the pending queue because of a hazard.
    unsigned MaxObservedLatency;
#endif

    void reset() {
      // A new HazardRec is created for each DAG and owned by SchedBoundary.
      // Destroying and reconstructing it is very expensive though. So keep
      // invalid, placeholder HazardRecs.
      if (HazardRec && HazardRec->isEnabled()) {
        delete HazardRec;
        HazardRec = 0;
      }
      Available.clear();
      Pending.clear();
      CheckPending = false;
      NextSUs.clear();
      CurrCycle = 0;
      CurrMOps = 0;
      MinReadyCycle = UINT_MAX;
      ExpectedLatency = 0;
      DependentLatency = 0;
      RetiredMOps = 0;
      MaxExecutedResCount = 0;
      ZoneCritResIdx = 0;
      IsResourceLimited = false;
#ifndef NDEBUG
      MaxObservedLatency = 0;
#endif
      // Reserve a zero-count for invalid CritResIdx.
      ExecutedResCounts.resize(1);
      assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
    }

    /// Pending queues extend the ready queues with the same ID and the
    /// PendingFlag set.
    SchedBoundary(unsigned ID, const Twine &Name):
      DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
      Pending(ID << GenericScheduler::LogMaxQID, Name+".P"),
      HazardRec(0) {
      reset();
    }

    ~SchedBoundary() { delete HazardRec; }

    void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel,
              SchedRemainder *rem);

    bool isTop() const {
      return Available.getID() == GenericScheduler::TopQID;
    }

#ifndef NDEBUG
    const char *getResourceName(unsigned PIdx) {
      if (!PIdx)
        return "MOps";
      return SchedModel->getProcResource(PIdx)->Name;
    }
#endif

    /// Get the number of latency cycles "covered" by the scheduled
    /// instructions. This is the larger of the critical path within the zone
    /// and the number of cycles required to issue the instructions.
    unsigned getScheduledLatency() const {
      return std::max(ExpectedLatency, CurrCycle);
    }

    unsigned getUnscheduledLatency(SUnit *SU) const {
      return isTop() ? SU->getHeight() : SU->getDepth();
    }

    unsigned getResourceCount(unsigned ResIdx) const {
      return ExecutedResCounts[ResIdx];
    }

    /// Get the scaled count of scheduled micro-ops and resources, including
    /// executed resources.
    unsigned getCriticalCount() const {
      if (!ZoneCritResIdx)
        return RetiredMOps * SchedModel->getMicroOpFactor();
      return getResourceCount(ZoneCritResIdx);
    }

    /// Get a scaled count for the minimum execution time of the scheduled
    /// micro-ops that are ready to execute by getExecutedCount. Notice the
    /// feedback loop.
1577 unsigned getExecutedCount() const { 1578 return std::max(CurrCycle * SchedModel->getLatencyFactor(), 1579 MaxExecutedResCount); 1580 } 1581 1582 bool checkHazard(SUnit *SU); 1583 1584 unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs); 1585 1586 unsigned getOtherResourceCount(unsigned &OtherCritIdx); 1587 1588 void setPolicy(CandPolicy &Policy, SchedBoundary &OtherZone); 1589 1590 void releaseNode(SUnit *SU, unsigned ReadyCycle); 1591 1592 void bumpCycle(unsigned NextCycle); 1593 1594 void incExecutedResources(unsigned PIdx, unsigned Count); 1595 1596 unsigned countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle); 1597 1598 void bumpNode(SUnit *SU); 1599 1600 void releasePending(); 1601 1602 void removeReady(SUnit *SU); 1603 1604 SUnit *pickOnlyChoice(); 1605 1606 #ifndef NDEBUG 1607 void dumpScheduledState(); 1608 #endif 1609 }; 1610 1611 private: 1612 const MachineSchedContext *Context; 1613 ScheduleDAGMI *DAG; 1614 const TargetSchedModel *SchedModel; 1615 const TargetRegisterInfo *TRI; 1616 1617 // State of the top and bottom scheduled instruction boundaries. 1618 SchedRemainder Rem; 1619 SchedBoundary Top; 1620 SchedBoundary Bot; 1621 1622 MachineSchedPolicy RegionPolicy; 1623 public: 1624 /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both) 1625 enum { 1626 TopQID = 1, 1627 BotQID = 2, 1628 LogMaxQID = 2 1629 }; 1630 1631 GenericScheduler(const MachineSchedContext *C): 1632 Context(C), DAG(0), SchedModel(0), TRI(0), 1633 Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {} 1634 1635 virtual void initPolicy(MachineBasicBlock::iterator Begin, 1636 MachineBasicBlock::iterator End, 1637 unsigned NumRegionInstrs); 1638 1639 bool shouldTrackPressure() const { return RegionPolicy.ShouldTrackPressure; } 1640 1641 virtual void initialize(ScheduleDAGMI *dag); 1642 1643 virtual SUnit *pickNode(bool &IsTopNode); 1644 1645 virtual void schedNode(SUnit *SU, bool IsTopNode); 1646 1647 virtual void releaseTopNode(SUnit *SU); 1648 1649 virtual void releaseBottomNode(SUnit *SU); 1650 1651 virtual void registerRoots(); 1652 1653 protected: 1654 void checkAcyclicLatency(); 1655 1656 void tryCandidate(SchedCandidate &Cand, 1657 SchedCandidate &TryCand, 1658 SchedBoundary &Zone, 1659 const RegPressureTracker &RPTracker, 1660 RegPressureTracker &TempTracker); 1661 1662 SUnit *pickNodeBidirectional(bool &IsTopNode); 1663 1664 void pickNodeFromQueue(SchedBoundary &Zone, 1665 const RegPressureTracker &RPTracker, 1666 SchedCandidate &Candidate); 1667 1668 void reschedulePhysRegCopies(SUnit *SU, bool isTop); 1669 1670 #ifndef NDEBUG 1671 void traceCandidate(const SchedCandidate &Cand); 1672 #endif 1673 }; 1674 } // namespace 1675 1676 void GenericScheduler::SchedRemainder:: 1677 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { 1678 reset(); 1679 if (!SchedModel->hasInstrSchedModel()) 1680 return; 1681 RemainingCounts.resize(SchedModel->getNumProcResourceKinds()); 1682 for (std::vector<SUnit>::iterator 1683 I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) { 1684 const MCSchedClassDesc *SC = DAG->getSchedClass(&*I); 1685 RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC) 1686 * SchedModel->getMicroOpFactor(); 1687 for (TargetSchedModel::ProcResIter 1688 PI = SchedModel->getWriteProcResBegin(SC), 1689 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 1690 unsigned PIdx = PI->ProcResourceIdx; 1691 unsigned Factor = SchedModel->getResourceFactor(PIdx); 1692 RemainingCounts[PIdx] += (Factor * PI->Cycles); 1693 } 1694 } 1695 } 1696 1697 void 
GenericScheduler::SchedBoundary:: 1698 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { 1699 reset(); 1700 DAG = dag; 1701 SchedModel = smodel; 1702 Rem = rem; 1703 if (SchedModel->hasInstrSchedModel()) 1704 ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds()); 1705 } 1706 1707 /// Initialize the per-region scheduling policy. 1708 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, 1709 MachineBasicBlock::iterator End, 1710 unsigned NumRegionInstrs) { 1711 const TargetMachine &TM = Context->MF->getTarget(); 1712 1713 // Avoid setting up the register pressure tracker for small regions to save 1714 // compile time. As a rough heuristic, only track pressure when the number of 1715 // schedulable instructions exceeds half the integer register file. 1716 unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs( 1717 TM.getTargetLowering()->getRegClassFor(MVT::i32)); 1718 1719 RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2); 1720 1721 // For generic targets, we default to bottom-up, because it's simpler and more 1722 // compile-time optimizations have been implemented in that direction. 1723 RegionPolicy.OnlyBottomUp = true; 1724 1725 // Allow the subtarget to override default policy. 1726 const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>(); 1727 ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs); 1728 1729 // After subtarget overrides, apply command line options. 1730 if (!EnableRegPressure) 1731 RegionPolicy.ShouldTrackPressure = false; 1732 1733 // Check -misched-topdown/bottomup can force or unforce scheduling direction. 1734 // e.g. -misched-bottomup=false allows scheduling in both directions. 1735 assert((!ForceTopDown || !ForceBottomUp) && 1736 "-misched-topdown incompatible with -misched-bottomup"); 1737 if (ForceBottomUp.getNumOccurrences() > 0) { 1738 RegionPolicy.OnlyBottomUp = ForceBottomUp; 1739 if (RegionPolicy.OnlyBottomUp) 1740 RegionPolicy.OnlyTopDown = false; 1741 } 1742 if (ForceTopDown.getNumOccurrences() > 0) { 1743 RegionPolicy.OnlyTopDown = ForceTopDown; 1744 if (RegionPolicy.OnlyTopDown) 1745 RegionPolicy.OnlyBottomUp = false; 1746 } 1747 } 1748 1749 void GenericScheduler::initialize(ScheduleDAGMI *dag) { 1750 DAG = dag; 1751 SchedModel = DAG->getSchedModel(); 1752 TRI = DAG->TRI; 1753 1754 Rem.init(DAG, SchedModel); 1755 Top.init(DAG, SchedModel, &Rem); 1756 Bot.init(DAG, SchedModel, &Rem); 1757 1758 // Initialize resource counts. 1759 1760 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or 1761 // are disabled, then these HazardRecs will be disabled. 
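// Note that reset() keeps a disabled recognizer around as a placeholder, so
// the null checks below only construct a new recognizer when none exists yet
// for this boundary.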
1762 const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); 1763 const TargetMachine &TM = DAG->MF.getTarget(); 1764 if (!Top.HazardRec) { 1765 Top.HazardRec = 1766 TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); 1767 } 1768 if (!Bot.HazardRec) { 1769 Bot.HazardRec = 1770 TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); 1771 } 1772 } 1773 1774 void GenericScheduler::releaseTopNode(SUnit *SU) { 1775 if (SU->isScheduled) 1776 return; 1777 1778 for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); 1779 I != E; ++I) { 1780 if (I->isWeak()) 1781 continue; 1782 unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle; 1783 unsigned Latency = I->getLatency(); 1784 #ifndef NDEBUG 1785 Top.MaxObservedLatency = std::max(Latency, Top.MaxObservedLatency); 1786 #endif 1787 if (SU->TopReadyCycle < PredReadyCycle + Latency) 1788 SU->TopReadyCycle = PredReadyCycle + Latency; 1789 } 1790 Top.releaseNode(SU, SU->TopReadyCycle); 1791 } 1792 1793 void GenericScheduler::releaseBottomNode(SUnit *SU) { 1794 if (SU->isScheduled) 1795 return; 1796 1797 assert(SU->getInstr() && "Scheduled SUnit must have instr"); 1798 1799 for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); 1800 I != E; ++I) { 1801 if (I->isWeak()) 1802 continue; 1803 unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; 1804 unsigned Latency = I->getLatency(); 1805 #ifndef NDEBUG 1806 Bot.MaxObservedLatency = std::max(Latency, Bot.MaxObservedLatency); 1807 #endif 1808 if (SU->BotReadyCycle < SuccReadyCycle + Latency) 1809 SU->BotReadyCycle = SuccReadyCycle + Latency; 1810 } 1811 Bot.releaseNode(SU, SU->BotReadyCycle); 1812 } 1813 1814 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic 1815 /// critical path by more cycles than it takes to drain the instruction buffer. 1816 /// We estimate an upper bound on in-flight instructions as: 1817 /// 1818 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height ) 1819 /// InFlightIterations = AcyclicPath / CyclesPerIteration 1820 /// InFlightResources = InFlightIterations * LoopResources 1821 /// 1822 /// TODO: Check execution resources in addition to IssueCount. 1823 void GenericScheduler::checkAcyclicLatency() { 1824 if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath) 1825 return; 1826 1827 // Scaled number of cycles per loop iteration. 1828 unsigned IterCount = 1829 std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(), 1830 Rem.RemIssueCount);
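// A worked example with purely hypothetical numbers: assume the latency and
// micro-op factors are both 1, CyclicCritPath = 4, CriticalPath = 12,
// RemIssueCount = 6 and a micro-op buffer of 8. Then IterCount = max(4, 6) = 6,
// the scaled acyclic path is 12, InFlightCount = (12 * 6 + 5) / 6 = 12, and
// BufferLimit = 8, so the region is flagged as acyclic-latency limited.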
1831 // Scaled acyclic critical path. 1832 unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor(); 1833 // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop 1834 unsigned InFlightCount = 1835 (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount; 1836 unsigned BufferLimit = 1837 SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor(); 1838 1839 Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit; 1840 1841 DEBUG(dbgs() << "IssueCycles=" 1842 << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c " 1843 << "IterCycles=" << IterCount / SchedModel->getLatencyFactor() 1844 << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount 1845 << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor() 1846 << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n"; 1847 if (Rem.IsAcyclicLatencyLimited) 1848 dbgs() << " ACYCLIC LATENCY LIMIT\n"); 1849 } 1850 1851 void GenericScheduler::registerRoots() { 1852 Rem.CriticalPath = DAG->ExitSU.getDepth(); 1853 1854 // Some roots may not feed into ExitSU. Check all of them in case. 1855 for (std::vector<SUnit*>::const_iterator 1856 I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) { 1857 if ((*I)->getDepth() > Rem.CriticalPath) 1858 Rem.CriticalPath = (*I)->getDepth(); 1859 } 1860 DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); 1861 1862 if (EnableCyclicPath) { 1863 Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); 1864 checkAcyclicLatency(); 1865 } 1866 } 1867 1868 /// Does this SU have a hazard within the current instruction group? 1869 /// 1870 /// The scheduler supports two modes of hazard recognition. The first is the 1871 /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that 1872 /// supports highly complicated in-order reservation tables 1873 /// (ScoreboardHazardRecognizer) and arbitrary target-specific logic. 1874 /// 1875 /// The second is a streamlined mechanism that checks for hazards based on 1876 /// simple counters that the scheduler itself maintains. It explicitly checks 1877 /// for instruction dispatch limitations, including the number of micro-ops that 1878 /// can dispatch per cycle. 1879 /// 1880 /// TODO: Also check whether the SU must start a new group. 1881 bool GenericScheduler::SchedBoundary::checkHazard(SUnit *SU) { 1882 if (HazardRec->isEnabled()) 1883 return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; 1884 1885 unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); 1886 if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) { 1887 DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" 1888 << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); 1889 return true; 1890 } 1891 return false; 1892 } 1893 1894 // Find the unscheduled node in ReadySUs with the highest latency. 1895 unsigned GenericScheduler::SchedBoundary:: 1896 findMaxLatency(ArrayRef<SUnit*> ReadySUs) { 1897 SUnit *LateSU = 0; 1898 unsigned RemLatency = 0; 1899 for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end(); 1900 I != E; ++I) { 1901 unsigned L = getUnscheduledLatency(*I); 1902 if (L > RemLatency) { 1903 RemLatency = L; 1904 LateSU = *I; 1905 } 1906 } 1907 if (LateSU) { 1908 DEBUG(dbgs() << Available.getName() << " RemLatency SU(" 1909 << LateSU->NodeNum << ") " << RemLatency << "c\n"); 1910 } 1911 return RemLatency; 1912 } 1913 1914 // Count resources in this zone and the remaining unscheduled 1915 // instructions. Return the max count, scaled.
Set OtherCritIdx to the critical 1916 // resource index, or zero if the zone is issue limited. 1917 unsigned GenericScheduler::SchedBoundary:: 1918 getOtherResourceCount(unsigned &OtherCritIdx) { 1919 OtherCritIdx = 0; 1920 if (!SchedModel->hasInstrSchedModel()) 1921 return 0; 1922 1923 unsigned OtherCritCount = Rem->RemIssueCount 1924 + (RetiredMOps * SchedModel->getMicroOpFactor()); 1925 DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: " 1926 << OtherCritCount / SchedModel->getMicroOpFactor() << '\n'); 1927 for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds(); 1928 PIdx != PEnd; ++PIdx) { 1929 unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx]; 1930 if (OtherCount > OtherCritCount) { 1931 OtherCritCount = OtherCount; 1932 OtherCritIdx = PIdx; 1933 } 1934 } 1935 if (OtherCritIdx) { 1936 DEBUG(dbgs() << " " << Available.getName() << " + Remain CritRes: " 1937 << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx) 1938 << " " << getResourceName(OtherCritIdx) << "\n"); 1939 } 1940 return OtherCritCount; 1941 } 1942 1943 /// Set the CandPolicy for this zone given the current resources and latencies 1944 /// inside and outside the zone. 1945 void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy, 1946 SchedBoundary &OtherZone) { 1947 // Now that potential stalls have been considered, apply preemptive heuristics 1948 // based on the total latency and resources inside and outside this 1949 // zone. 1950 1951 // Compute remaining latency. We need this both to determine whether the 1952 // overall schedule has become latency-limited and whether the instructions 1953 // outside this zone are resource or latency limited. 1954 // 1955 // The "dependent" latency is updated incrementally during scheduling as the 1956 // max height/depth of scheduled nodes minus the cycles since it was 1957 // scheduled: 1958 // DLat = max (N.depth - (CurrCycle - N.ReadyCycle)) for N in Zone 1959 // 1960 // The "independent" latency is the max ready queue depth: 1961 // ILat = max N.depth for N in Available|Pending 1962 // 1963 // RemainingLatency is the greater of independent and dependent latency. 1964 unsigned RemLatency = DependentLatency; 1965 RemLatency = std::max(RemLatency, findMaxLatency(Available.elements())); 1966 RemLatency = std::max(RemLatency, findMaxLatency(Pending.elements())); 1967 1968 // Compute the critical resource outside the zone. 1969 unsigned OtherCritIdx; 1970 unsigned OtherCount = OtherZone.getOtherResourceCount(OtherCritIdx); 1971 1972 bool OtherResLimited = false; 1973 if (SchedModel->hasInstrSchedModel()) { 1974 unsigned LFactor = SchedModel->getLatencyFactor(); 1975 OtherResLimited = (int)(OtherCount - (RemLatency * LFactor)) > (int)LFactor; 1976 } 1977 if (!OtherResLimited && (RemLatency + CurrCycle > Rem->CriticalPath)) { 1978 Policy.ReduceLatency |= true; 1979 DEBUG(dbgs() << " " << Available.getName() << " RemainingLatency " 1980 << RemLatency << " + " << CurrCycle << "c > CritPath " 1981 << Rem->CriticalPath << "\n"); 1982 } 1983 // If the same resource is limiting inside and outside the zone, do nothing.
1984 if (ZoneCritResIdx == OtherCritIdx) 1985 return; 1986 1987 DEBUG( 1988 if (IsResourceLimited) { 1989 dbgs() << " " << Available.getName() << " ResourceLimited: " 1990 << getResourceName(ZoneCritResIdx) << "\n"; 1991 } 1992 if (OtherResLimited) 1993 dbgs() << " RemainingLimit: " << getResourceName(OtherCritIdx) << "\n"; 1994 if (!IsResourceLimited && !OtherResLimited) 1995 dbgs() << " Latency limited both directions.\n"); 1996 1997 if (IsResourceLimited && !Policy.ReduceResIdx) 1998 Policy.ReduceResIdx = ZoneCritResIdx; 1999 2000 if (OtherResLimited) 2001 Policy.DemandResIdx = OtherCritIdx; 2002 } 2003 2004 void GenericScheduler::SchedBoundary::releaseNode(SUnit *SU, 2005 unsigned ReadyCycle) { 2006 if (ReadyCycle < MinReadyCycle) 2007 MinReadyCycle = ReadyCycle; 2008 2009 // Check for interlocks first. For the purpose of other heuristics, an 2010 // instruction that cannot issue appears as if it's not in the ReadyQueue. 2011 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; 2012 if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU)) 2013 Pending.push(SU); 2014 else 2015 Available.push(SU); 2016 2017 // Record this node as an immediate dependent of the scheduled node. 2018 NextSUs.insert(SU); 2019 } 2020 2021 /// Move the boundary of scheduled code by one cycle. 2022 void GenericScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) { 2023 if (SchedModel->getMicroOpBufferSize() == 0) { 2024 assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); 2025 if (MinReadyCycle > NextCycle) 2026 NextCycle = MinReadyCycle; 2027 } 2028 // Update the current micro-ops, which will issue in the next cycle. 2029 unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle); 2030 CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps; 2031 2032 // Decrement DependentLatency based on the next cycle. 2033 if ((NextCycle - CurrCycle) > DependentLatency) 2034 DependentLatency = 0; 2035 else 2036 DependentLatency -= (NextCycle - CurrCycle); 2037 2038 if (!HazardRec->isEnabled()) { 2039 // Bypass HazardRec virtual calls. 2040 CurrCycle = NextCycle; 2041 } 2042 else { 2043 // Bypass getHazardType calls in case of long latency. 2044 for (; CurrCycle != NextCycle; ++CurrCycle) { 2045 if (isTop()) 2046 HazardRec->AdvanceCycle(); 2047 else 2048 HazardRec->RecedeCycle(); 2049 } 2050 } 2051 CheckPending = true; 2052 unsigned LFactor = SchedModel->getLatencyFactor(); 2053 IsResourceLimited = 2054 (int)(getCriticalCount() - (getScheduledLatency() * LFactor)) 2055 > (int)LFactor; 2056 2057 DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n'); 2058 } 2059 2060 void GenericScheduler::SchedBoundary::incExecutedResources(unsigned PIdx, 2061 unsigned Count) { 2062 ExecutedResCounts[PIdx] += Count; 2063 if (ExecutedResCounts[PIdx] > MaxExecutedResCount) 2064 MaxExecutedResCount = ExecutedResCounts[PIdx]; 2065 } 2066 2067 /// Add the given processor resource to this scheduled zone. 2068 /// 2069 /// \param Cycles indicates the number of consecutive (non-pipelined) cycles 2070 /// during which this resource is consumed. 2071 /// 2072 /// \return the next cycle at which the instruction may execute without 2073 /// oversubscribing resources. 
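/// As a small, hypothetical illustration: with getResourceFactor(PIdx) == 2,
/// holding the resource for Cycles == 3 contributes Count = 2 * 3 = 6 scaled
/// units, which are added to ExecutedResCounts[PIdx] and subtracted from
/// Rem->RemainingCounts[PIdx] below.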
2074 unsigned GenericScheduler::SchedBoundary:: 2075 countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) { 2076 unsigned Factor = SchedModel->getResourceFactor(PIdx); 2077 unsigned Count = Factor * Cycles; 2078 DEBUG(dbgs() << " " << getResourceName(PIdx) 2079 << " +" << Cycles << "x" << Factor << "u\n"); 2080 2081 // Update Executed resources counts. 2082 incExecutedResources(PIdx, Count); 2083 assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted"); 2084 Rem->RemainingCounts[PIdx] -= Count; 2085 2086 // Check if this resource exceeds the current critical resource. If so, it 2087 // becomes the critical resource. 2088 if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) { 2089 ZoneCritResIdx = PIdx; 2090 DEBUG(dbgs() << " *** Critical resource " 2091 << getResourceName(PIdx) << ": " 2092 << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n"); 2093 } 2094 // TODO: We don't yet model reserved resources. It's not hard though. 2095 return CurrCycle; 2096 } 2097 2098 /// Move the boundary of scheduled code by one SUnit. 2099 void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) { 2100 // Update the reservation table. 2101 if (HazardRec->isEnabled()) { 2102 if (!isTop() && SU->isCall) { 2103 // Calls are scheduled with their preceding instructions. For bottom-up 2104 // scheduling, clear the pipeline state before emitting. 2105 HazardRec->Reset(); 2106 } 2107 HazardRec->EmitInstruction(SU); 2108 } 2109 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 2110 unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr()); 2111 CurrMOps += IncMOps; 2112 // checkHazard prevents scheduling multiple instructions per cycle that exceed 2113 // issue width. However, we commonly reach the maximum. In this case 2114 // opportunistically bump the cycle to avoid uselessly checking everything in 2115 // the readyQ. Furthermore, a single instruction may produce more than one 2116 // cycle's worth of micro-ops. 2117 // 2118 // TODO: Also check if this SU must end a dispatch group. 2119 unsigned NextCycle = CurrCycle; 2120 if (CurrMOps >= SchedModel->getIssueWidth()) { 2121 ++NextCycle; 2122 DEBUG(dbgs() << " *** Max MOps " << CurrMOps 2123 << " at cycle " << CurrCycle << '\n'); 2124 } 2125 unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle); 2126 DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n"); 2127 2128 switch (SchedModel->getMicroOpBufferSize()) { 2129 case 0: 2130 assert(ReadyCycle <= CurrCycle && "Broken PendingQueue"); 2131 break; 2132 case 1: 2133 if (ReadyCycle > NextCycle) { 2134 NextCycle = ReadyCycle; 2135 DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n"); 2136 } 2137 break; 2138 default: 2139 // We don't currently model the OOO reorder buffer, so consider all 2140 // scheduled MOps to be "retired". 2141 break; 2142 } 2143 RetiredMOps += IncMOps; 2144 2145 // Update resource counts and critical resource. 2146 if (SchedModel->hasInstrSchedModel()) { 2147 unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor(); 2148 assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted"); 2149 Rem->RemIssueCount -= DecRemIssue; 2150 if (ZoneCritResIdx) { 2151 // Scale scheduled micro-ops for comparing with the critical resource. 2152 unsigned ScaledMOps = 2153 RetiredMOps * SchedModel->getMicroOpFactor(); 2154 2155 // If scaled micro-ops are now more than the previous critical resource by 2156 // a full cycle, then micro-ops issue becomes critical. 
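// For example (hypothetical numbers): with a latency factor of 2, if the
// scaled retired micro-ops reach 24 while the current critical resource count
// is 20, the difference of 4 is at least one latency factor, so micro-op
// issue (index 0) takes over as the critical resource.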
2157 if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx)) 2158 >= (int)SchedModel->getLatencyFactor()) { 2159 ZoneCritResIdx = 0; 2160 DEBUG(dbgs() << " *** Critical resource NumMicroOps: " 2161 << ScaledMOps / SchedModel->getLatencyFactor() << "c\n"); 2162 } 2163 } 2164 for (TargetSchedModel::ProcResIter 2165 PI = SchedModel->getWriteProcResBegin(SC), 2166 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2167 unsigned RCycle = 2168 countResource(PI->ProcResourceIdx, PI->Cycles, ReadyCycle); 2169 if (RCycle > NextCycle) 2170 NextCycle = RCycle; 2171 } 2172 } 2173 // Update ExpectedLatency and DependentLatency. 2174 unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency; 2175 unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency; 2176 if (SU->getDepth() > TopLatency) { 2177 TopLatency = SU->getDepth(); 2178 DEBUG(dbgs() << " " << Available.getName() 2179 << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n"); 2180 } 2181 if (SU->getHeight() > BotLatency) { 2182 BotLatency = SU->getHeight(); 2183 DEBUG(dbgs() << " " << Available.getName() 2184 << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n"); 2185 } 2186 // If we stall for any reason, bump the cycle. 2187 if (NextCycle > CurrCycle) { 2188 bumpCycle(NextCycle); 2189 } 2190 else { 2191 // After updating ZoneCritResIdx and ExpectedLatency, check if we're 2192 // resource limited. If a stall occurred, bumpCycle does this. 2193 unsigned LFactor = SchedModel->getLatencyFactor(); 2194 IsResourceLimited = 2195 (int)(getCriticalCount() - (getScheduledLatency() * LFactor)) 2196 > (int)LFactor; 2197 } 2198 DEBUG(dumpScheduledState()); 2199 } 2200 2201 /// Release pending ready nodes into the available queue. This makes them 2202 /// visible to heuristics. 2203 void GenericScheduler::SchedBoundary::releasePending() { 2204 // If the available queue is empty, it is safe to reset MinReadyCycle. 2205 if (Available.empty()) 2206 MinReadyCycle = UINT_MAX; 2207 2208 // Check to see if any of the pending instructions are ready to issue. If 2209 // so, add them to the available queue. 2210 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; 2211 for (unsigned i = 0, e = Pending.size(); i != e; ++i) { 2212 SUnit *SU = *(Pending.begin()+i); 2213 unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; 2214 2215 if (ReadyCycle < MinReadyCycle) 2216 MinReadyCycle = ReadyCycle; 2217 2218 if (!IsBuffered && ReadyCycle > CurrCycle) 2219 continue; 2220 2221 if (checkHazard(SU)) 2222 continue; 2223 2224 Available.push(SU); 2225 Pending.remove(Pending.begin()+i); 2226 --i; --e; 2227 } 2228 DEBUG(if (!Pending.empty()) Pending.dump()); 2229 CheckPending = false; 2230 } 2231 2232 /// Remove SU from the ready set for this boundary. 2233 void GenericScheduler::SchedBoundary::removeReady(SUnit *SU) { 2234 if (Available.isInQueue(SU)) 2235 Available.remove(Available.find(SU)); 2236 else { 2237 assert(Pending.isInQueue(SU) && "bad ready count"); 2238 Pending.remove(Pending.find(SU)); 2239 } 2240 } 2241 2242 /// If this queue only has one ready candidate, return it. As a side effect, 2243 /// defer any nodes that now hit a hazard, and advance the cycle until at least 2244 /// one node is ready. If multiple instructions are ready, return NULL. 2245 SUnit *GenericScheduler::SchedBoundary::pickOnlyChoice() { 2246 if (CheckPending) 2247 releasePending(); 2248 2249 if (CurrMOps > 0) { 2250 // Defer any ready instrs that now have a hazard.
2251 for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) { 2252 if (checkHazard(*I)) { 2253 Pending.push(*I); 2254 I = Available.remove(I); 2255 continue; 2256 } 2257 ++I; 2258 } 2259 } 2260 for (unsigned i = 0; Available.empty(); ++i) { 2261 assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) && 2262 "permanent hazard"); (void)i; 2263 bumpCycle(CurrCycle + 1); 2264 releasePending(); 2265 } 2266 if (Available.size() == 1) 2267 return *Available.begin(); 2268 return NULL; 2269 } 2270 2271 #ifndef NDEBUG 2272 // This is useful information to dump after bumpNode. 2273 // Note that the Queue contents are more useful before pickNodeFromQueue. 2274 void GenericScheduler::SchedBoundary::dumpScheduledState() { 2275 unsigned ResFactor; 2276 unsigned ResCount; 2277 if (ZoneCritResIdx) { 2278 ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx); 2279 ResCount = getResourceCount(ZoneCritResIdx); 2280 } 2281 else { 2282 ResFactor = SchedModel->getMicroOpFactor(); 2283 ResCount = RetiredMOps * SchedModel->getMicroOpFactor(); 2284 } 2285 unsigned LFactor = SchedModel->getLatencyFactor(); 2286 dbgs() << Available.getName() << " @" << CurrCycle << "c\n" 2287 << " Retired: " << RetiredMOps; 2288 dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c"; 2289 dbgs() << "\n Critical: " << ResCount / LFactor << "c, " 2290 << ResCount / ResFactor << " " << getResourceName(ZoneCritResIdx) 2291 << "\n ExpectedLatency: " << ExpectedLatency << "c\n" 2292 << (IsResourceLimited ? " - Resource" : " - Latency") 2293 << " limited.\n"; 2294 } 2295 #endif 2296 2297 void GenericScheduler::SchedCandidate:: 2298 initResourceDelta(const ScheduleDAGMI *DAG, 2299 const TargetSchedModel *SchedModel) { 2300 if (!Policy.ReduceResIdx && !Policy.DemandResIdx) 2301 return; 2302 2303 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 2304 for (TargetSchedModel::ProcResIter 2305 PI = SchedModel->getWriteProcResBegin(SC), 2306 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2307 if (PI->ProcResourceIdx == Policy.ReduceResIdx) 2308 ResDelta.CritResources += PI->Cycles; 2309 if (PI->ProcResourceIdx == Policy.DemandResIdx) 2310 ResDelta.DemandedResources += PI->Cycles; 2311 } 2312 } 2313 2314 2315 /// Return true if this heuristic determines order. 
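/// The comparison helpers below share one convention: they return true as soon
/// as a strict difference decides the comparison (setting TryCand.Reason when
/// TryCand wins, or raising the priority of Cand.Reason when Cand wins), and
/// return false on a tie after recording it via Cand.setRepeat(Reason).
/// A typical call site, as used in tryCandidate() further down, looks like:
///   if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
///               getWeakLeft(Cand.SU, Zone.isTop()),
///               TryCand, Cand, Weak))
///     return;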
2316 static bool tryLess(int TryVal, int CandVal, 2317 GenericScheduler::SchedCandidate &TryCand, 2318 GenericScheduler::SchedCandidate &Cand, 2319 GenericScheduler::CandReason Reason) { 2320 if (TryVal < CandVal) { 2321 TryCand.Reason = Reason; 2322 return true; 2323 } 2324 if (TryVal > CandVal) { 2325 if (Cand.Reason > Reason) 2326 Cand.Reason = Reason; 2327 return true; 2328 } 2329 Cand.setRepeat(Reason); 2330 return false; 2331 } 2332 2333 static bool tryGreater(int TryVal, int CandVal, 2334 GenericScheduler::SchedCandidate &TryCand, 2335 GenericScheduler::SchedCandidate &Cand, 2336 GenericScheduler::CandReason Reason) { 2337 if (TryVal > CandVal) { 2338 TryCand.Reason = Reason; 2339 return true; 2340 } 2341 if (TryVal < CandVal) { 2342 if (Cand.Reason > Reason) 2343 Cand.Reason = Reason; 2344 return true; 2345 } 2346 Cand.setRepeat(Reason); 2347 return false; 2348 } 2349 2350 static bool tryPressure(const PressureChange &TryP, 2351 const PressureChange &CandP, 2352 GenericScheduler::SchedCandidate &TryCand, 2353 GenericScheduler::SchedCandidate &Cand, 2354 GenericScheduler::CandReason Reason) { 2355 int TryRank = TryP.getPSetOrMax(); 2356 int CandRank = CandP.getPSetOrMax(); 2357 // If both candidates affect the same set, go with the smallest increase. 2358 if (TryRank == CandRank) { 2359 return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, 2360 Reason); 2361 } 2362 // If one candidate decreases and the other increases, go with it. 2363 // Invalid candidates have UnitInc==0. 2364 if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, 2365 Reason)) { 2366 return true; 2367 } 2368 // If the candidates are decreasing pressure, reverse priority. 2369 if (TryP.getUnitInc() < 0) 2370 std::swap(TryRank, CandRank); 2371 return tryGreater(TryRank, CandRank, TryCand, Cand, Reason); 2372 } 2373 2374 static unsigned getWeakLeft(const SUnit *SU, bool isTop) { 2375 return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft; 2376 } 2377 2378 /// Minimize physical register live ranges. Regalloc wants them adjacent to 2379 /// their physreg def/use. 2380 /// 2381 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf 2382 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled 2383 /// with the operation that produces or consumes the physreg. We'll do this when 2384 /// regalloc has support for parallel copies. 2385 static int biasPhysRegCopy(const SUnit *SU, bool isTop) { 2386 const MachineInstr *MI = SU->getInstr(); 2387 if (!MI->isCopy()) 2388 return 0; 2389 2390 unsigned ScheduledOper = isTop ? 1 : 0; 2391 unsigned UnscheduledOper = isTop ? 0 : 1; 2392 // If we have already scheduled the physreg produce/consumer, immediately 2393 // schedule the copy. 2394 if (TargetRegisterInfo::isPhysicalRegister( 2395 MI->getOperand(ScheduledOper).getReg())) 2396 return 1; 2397 // If the physreg is at the boundary, defer it. Otherwise schedule it 2398 // immediately to free the dependent. We can hoist the copy later. 2399 bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; 2400 if (TargetRegisterInfo::isPhysicalRegister( 2401 MI->getOperand(UnscheduledOper).getReg())) 2402 return AtBoundary ? 
-1 : 1; 2403 return 0; 2404 } 2405 2406 static bool tryLatency(GenericScheduler::SchedCandidate &TryCand, 2407 GenericScheduler::SchedCandidate &Cand, 2408 GenericScheduler::SchedBoundary &Zone) { 2409 if (Zone.isTop()) { 2410 if (Cand.SU->getDepth() > Zone.getScheduledLatency()) { 2411 if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), 2412 TryCand, Cand, GenericScheduler::TopDepthReduce)) 2413 return true; 2414 } 2415 if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(), 2416 TryCand, Cand, GenericScheduler::TopPathReduce)) 2417 return true; 2418 } 2419 else { 2420 if (Cand.SU->getHeight() > Zone.getScheduledLatency()) { 2421 if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), 2422 TryCand, Cand, GenericScheduler::BotHeightReduce)) 2423 return true; 2424 } 2425 if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(), 2426 TryCand, Cand, GenericScheduler::BotPathReduce)) 2427 return true; 2428 } 2429 return false; 2430 } 2431 2432 /// Apply a set of heuristics to a new candidate. Heuristics are currently 2433 /// hierarchical. This may be more efficient than a graduated cost model because 2434 /// we don't need to evaluate all aspects of the model for each node in the 2435 /// queue. But it's really done to make the heuristics easier to debug and 2436 /// statistically analyze. 2437 /// 2438 /// \param Cand provides the policy and current best candidate. 2439 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. 2440 /// \param Zone describes the scheduled zone that we are extending. 2441 /// \param RPTracker describes reg pressure within the scheduled zone. 2442 /// \param TempTracker is a scratch pressure tracker to reuse in queries. 2443 void GenericScheduler::tryCandidate(SchedCandidate &Cand, 2444 SchedCandidate &TryCand, 2445 SchedBoundary &Zone, 2446 const RegPressureTracker &RPTracker, 2447 RegPressureTracker &TempTracker) { 2448 2449 if (DAG->isTrackingPressure()) { 2450 // Always initialize TryCand's RPDelta. 2451 if (Zone.isTop()) { 2452 TempTracker.getMaxDownwardPressureDelta( 2453 TryCand.SU->getInstr(), 2454 TryCand.RPDelta, 2455 DAG->getRegionCriticalPSets(), 2456 DAG->getRegPressure().MaxSetPressure); 2457 } 2458 else { 2459 if (VerifyScheduling) { 2460 TempTracker.getMaxUpwardPressureDelta( 2461 TryCand.SU->getInstr(), 2462 &DAG->getPressureDiff(TryCand.SU), 2463 TryCand.RPDelta, 2464 DAG->getRegionCriticalPSets(), 2465 DAG->getRegPressure().MaxSetPressure); 2466 } 2467 else { 2468 RPTracker.getUpwardPressureDelta( 2469 TryCand.SU->getInstr(), 2470 DAG->getPressureDiff(TryCand.SU), 2471 TryCand.RPDelta, 2472 DAG->getRegionCriticalPSets(), 2473 DAG->getRegPressure().MaxSetPressure); 2474 } 2475 } 2476 } 2477 DEBUG(if (TryCand.RPDelta.Excess.isValid()) 2478 dbgs() << " SU(" << TryCand.SU->NodeNum << ") " 2479 << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet()) 2480 << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n"); 2481 2482 // Initialize the candidate if needed. 2483 if (!Cand.isValid()) { 2484 TryCand.Reason = NodeOrder; 2485 return; 2486 } 2487 2488 if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()), 2489 biasPhysRegCopy(Cand.SU, Zone.isTop()), 2490 TryCand, Cand, PhysRegCopy)) 2491 return; 2492 2493 // Avoid exceeding the target's limit. If signed PSetID is negative, it is 2494 // invalid; convert it to INT_MAX to give it lowest priority.
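// (From this point the checks fall through in decreasing priority: excess set
// pressure, critical-set pressure, latency for acyclic-limited loops,
// load/store clustering, weak edges, overall max pressure, critical-resource
// reduction, resource demand, general latency, proximity to the last
// scheduled instruction, and finally original node order.)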
2495 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess, 2496 Cand.RPDelta.Excess, 2497 TryCand, Cand, RegExcess)) 2498 return; 2499 2500 // Avoid increasing the max critical pressure in the scheduled region. 2501 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, 2502 Cand.RPDelta.CriticalMax, 2503 TryCand, Cand, RegCritical)) 2504 return; 2505 2506 // For loops that are acyclic path limited, aggressively schedule for latency. 2507 // This can result in very long dependence chains scheduled in sequence, so 2508 // once every cycle (when CurrMOps == 0), switch to normal heuristics. 2509 if (Rem.IsAcyclicLatencyLimited && !Zone.CurrMOps 2510 && tryLatency(TryCand, Cand, Zone)) 2511 return; 2512 2513 // Keep clustered nodes together to encourage downstream peephole 2514 // optimizations which may reduce resource requirements. 2515 // 2516 // This is a best effort to set things up for a post-RA pass. Optimizations 2517 // like generating loads of multiple registers should ideally be done within 2518 // the scheduler pass by combining the loads during DAG postprocessing. 2519 const SUnit *NextClusterSU = 2520 Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); 2521 if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU, 2522 TryCand, Cand, Cluster)) 2523 return; 2524 2525 // Weak edges are for clustering and other constraints. 2526 if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()), 2527 getWeakLeft(Cand.SU, Zone.isTop()), 2528 TryCand, Cand, Weak)) { 2529 return; 2530 } 2531 // Avoid increasing the max pressure of the entire region. 2532 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, 2533 Cand.RPDelta.CurrentMax, 2534 TryCand, Cand, RegMax)) 2535 return; 2536 2537 // Avoid critical resource consumption and balance the schedule. 2538 TryCand.initResourceDelta(DAG, SchedModel); 2539 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, 2540 TryCand, Cand, ResourceReduce)) 2541 return; 2542 if (tryGreater(TryCand.ResDelta.DemandedResources, 2543 Cand.ResDelta.DemandedResources, 2544 TryCand, Cand, ResourceDemand)) 2545 return; 2546 2547 // Avoid serializing long latency dependence chains. 2548 // For acyclic path limited loops, latency was already checked above. 2549 if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited 2550 && tryLatency(TryCand, Cand, Zone)) { 2551 return; 2552 } 2553 2554 // Prefer immediate defs/users of the last scheduled instruction. This is a 2555 // local pressure avoidance strategy that also makes the machine code 2556 // readable. 2557 if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU), 2558 TryCand, Cand, NextDefUse)) 2559 return; 2560 2561 // Fall through to original instruction order. 
2562 if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) 2563 || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { 2564 TryCand.Reason = NodeOrder; 2565 } 2566 } 2567 2568 #ifndef NDEBUG 2569 const char *GenericScheduler::getReasonStr( 2570 GenericScheduler::CandReason Reason) { 2571 switch (Reason) { 2572 case NoCand: return "NOCAND "; 2573 case PhysRegCopy: return "PREG-COPY"; 2574 case RegExcess: return "REG-EXCESS"; 2575 case RegCritical: return "REG-CRIT "; 2576 case Cluster: return "CLUSTER "; 2577 case Weak: return "WEAK "; 2578 case RegMax: return "REG-MAX "; 2579 case ResourceReduce: return "RES-REDUCE"; 2580 case ResourceDemand: return "RES-DEMAND"; 2581 case TopDepthReduce: return "TOP-DEPTH "; 2582 case TopPathReduce: return "TOP-PATH "; 2583 case BotHeightReduce:return "BOT-HEIGHT"; 2584 case BotPathReduce: return "BOT-PATH "; 2585 case NextDefUse: return "DEF-USE "; 2586 case NodeOrder: return "ORDER "; 2587 }; 2588 llvm_unreachable("Unknown reason!"); 2589 } 2590 2591 void GenericScheduler::traceCandidate(const SchedCandidate &Cand) { 2592 PressureChange P; 2593 unsigned ResIdx = 0; 2594 unsigned Latency = 0; 2595 switch (Cand.Reason) { 2596 default: 2597 break; 2598 case RegExcess: 2599 P = Cand.RPDelta.Excess; 2600 break; 2601 case RegCritical: 2602 P = Cand.RPDelta.CriticalMax; 2603 break; 2604 case RegMax: 2605 P = Cand.RPDelta.CurrentMax; 2606 break; 2607 case ResourceReduce: 2608 ResIdx = Cand.Policy.ReduceResIdx; 2609 break; 2610 case ResourceDemand: 2611 ResIdx = Cand.Policy.DemandResIdx; 2612 break; 2613 case TopDepthReduce: 2614 Latency = Cand.SU->getDepth(); 2615 break; 2616 case TopPathReduce: 2617 Latency = Cand.SU->getHeight(); 2618 break; 2619 case BotHeightReduce: 2620 Latency = Cand.SU->getHeight(); 2621 break; 2622 case BotPathReduce: 2623 Latency = Cand.SU->getDepth(); 2624 break; 2625 } 2626 dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); 2627 if (P.isValid()) 2628 dbgs() << " " << TRI->getRegPressureSetName(P.getPSet()) 2629 << ":" << P.getUnitInc() << " "; 2630 else 2631 dbgs() << " "; 2632 if (ResIdx) 2633 dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " "; 2634 else 2635 dbgs() << " "; 2636 if (Latency) 2637 dbgs() << " " << Latency << " cycles "; 2638 else 2639 dbgs() << " "; 2640 dbgs() << '\n'; 2641 } 2642 #endif 2643 2644 /// Pick the best candidate from the queue. 2645 /// 2646 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during 2647 /// DAG building. To adjust for the current scheduling location we need to 2648 /// maintain the number of vreg uses remaining to be top-scheduled. 2649 void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, 2650 const RegPressureTracker &RPTracker, 2651 SchedCandidate &Cand) { 2652 ReadyQueue &Q = Zone.Available; 2653 2654 DEBUG(Q.dump()); 2655 2656 // getMaxPressureDelta temporarily modifies the tracker. 2657 RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); 2658 2659 for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { 2660 2661 SchedCandidate TryCand(Cand.Policy); 2662 TryCand.SU = *I; 2663 tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker); 2664 if (TryCand.Reason != NoCand) { 2665 // Initialize resource delta if needed in case future heuristics query it. 
2666 if (TryCand.ResDelta == SchedResourceDelta()) 2667 TryCand.initResourceDelta(DAG, SchedModel); 2668 Cand.setBest(TryCand); 2669 DEBUG(traceCandidate(Cand)); 2670 } 2671 } 2672 } 2673 2674 static void tracePick(const GenericScheduler::SchedCandidate &Cand, 2675 bool IsTop) { 2676 DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") 2677 << GenericScheduler::getReasonStr(Cand.Reason) << '\n'); 2678 } 2679 2680 /// Pick the best candidate node from either the top or bottom queue. 2681 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) { 2682 // Schedule as far as possible in the direction of no choice. This is most 2683 // efficient, but also provides the best heuristics for CriticalPSets. 2684 if (SUnit *SU = Bot.pickOnlyChoice()) { 2685 IsTopNode = false; 2686 DEBUG(dbgs() << "Pick Bot NOCAND\n"); 2687 return SU; 2688 } 2689 if (SUnit *SU = Top.pickOnlyChoice()) { 2690 IsTopNode = true; 2691 DEBUG(dbgs() << "Pick Top NOCAND\n"); 2692 return SU; 2693 } 2694 CandPolicy NoPolicy; 2695 SchedCandidate BotCand(NoPolicy); 2696 SchedCandidate TopCand(NoPolicy); 2697 Bot.setPolicy(BotCand.Policy, Top); 2698 Top.setPolicy(TopCand.Policy, Bot); 2699 2700 // Prefer bottom scheduling when heuristics are silent. 2701 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); 2702 assert(BotCand.Reason != NoCand && "failed to find the first candidate"); 2703 2704 // If either Q has a single candidate that provides the least increase in 2705 // Excess pressure, we can immediately schedule from that Q. 2706 // 2707 // RegionCriticalPSets summarizes the pressure within the scheduled region and 2708 // affects picking from either Q. If scheduling in one direction must 2709 // increase pressure for one of the excess PSets, then schedule in that 2710 // direction first to provide more freedom in the other direction. 2711 if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess)) 2712 || (BotCand.Reason == RegCritical 2713 && !BotCand.isRepeat(RegCritical))) 2714 { 2715 IsTopNode = false; 2716 tracePick(BotCand, IsTopNode); 2717 return BotCand.SU; 2718 } 2719 // Check if the top Q has a better candidate. 2720 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); 2721 assert(TopCand.Reason != NoCand && "failed to find the first candidate"); 2722 2723 // Choose the queue with the most important (lowest enum) reason. 2724 if (TopCand.Reason < BotCand.Reason) { 2725 IsTopNode = true; 2726 tracePick(TopCand, IsTopNode); 2727 return TopCand.SU; 2728 } 2729 // Otherwise prefer the bottom candidate, in node order if all else failed. 2730 IsTopNode = false; 2731 tracePick(BotCand, IsTopNode); 2732 return BotCand.SU; 2733 } 2734 2735 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy. 
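/// Depending on the per-region policy this drains only the Top zone
/// (OnlyTopDown), only the Bot zone (OnlyBottomUp), or calls
/// pickNodeBidirectional() to compare the best candidate of each zone.
/// Nodes already scheduled from the other boundary are skipped.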
2736 SUnit *GenericScheduler::pickNode(bool &IsTopNode) { 2737 if (DAG->top() == DAG->bottom()) { 2738 assert(Top.Available.empty() && Top.Pending.empty() && 2739 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); 2740 return NULL; 2741 } 2742 SUnit *SU; 2743 do { 2744 if (RegionPolicy.OnlyTopDown) { 2745 SU = Top.pickOnlyChoice(); 2746 if (!SU) { 2747 CandPolicy NoPolicy; 2748 SchedCandidate TopCand(NoPolicy); 2749 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); 2750 assert(TopCand.Reason != NoCand && "failed to find a candidate"); 2751 tracePick(TopCand, true); 2752 SU = TopCand.SU; 2753 } 2754 IsTopNode = true; 2755 } 2756 else if (RegionPolicy.OnlyBottomUp) { 2757 SU = Bot.pickOnlyChoice(); 2758 if (!SU) { 2759 CandPolicy NoPolicy; 2760 SchedCandidate BotCand(NoPolicy); 2761 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); 2762 assert(BotCand.Reason != NoCand && "failed to find a candidate"); 2763 tracePick(BotCand, false); 2764 SU = BotCand.SU; 2765 } 2766 IsTopNode = false; 2767 } 2768 else { 2769 SU = pickNodeBidirectional(IsTopNode); 2770 } 2771 } while (SU->isScheduled); 2772 2773 if (SU->isTopReady()) 2774 Top.removeReady(SU); 2775 if (SU->isBottomReady()) 2776 Bot.removeReady(SU); 2777 2778 DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); 2779 return SU; 2780 } 2781 2782 void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { 2783 2784 MachineBasicBlock::iterator InsertPos = SU->getInstr(); 2785 if (!isTop) 2786 ++InsertPos; 2787 SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs; 2788 2789 // Find already scheduled copies with a single physreg dependence and move 2790 // them just above the scheduled instruction. 2791 for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end(); 2792 I != E; ++I) { 2793 if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg())) 2794 continue; 2795 SUnit *DepSU = I->getSUnit(); 2796 if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) 2797 continue; 2798 MachineInstr *Copy = DepSU->getInstr(); 2799 if (!Copy->isCopy()) 2800 continue; 2801 DEBUG(dbgs() << " Rescheduling physreg copy "; 2802 I->getSUnit()->dump(DAG)); 2803 DAG->moveInstruction(Copy, InsertPos); 2804 } 2805 } 2806 2807 /// Update the scheduler's state after scheduling a node. This is the same node 2808 /// that was just returned by pickNode(). However, ScheduleDAGMI needs to update 2809 /// it's state based on the current cycle before MachineSchedStrategy does. 2810 /// 2811 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling 2812 /// them here. See comments in biasPhysRegCopy. 2813 void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { 2814 if (IsTopNode) { 2815 SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.CurrCycle); 2816 Top.bumpNode(SU); 2817 if (SU->hasPhysRegUses) 2818 reschedulePhysRegCopies(SU, true); 2819 } 2820 else { 2821 SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.CurrCycle); 2822 Bot.bumpNode(SU); 2823 if (SU->hasPhysRegDefs) 2824 reschedulePhysRegCopies(SU, false); 2825 } 2826 } 2827 2828 /// Create the standard converging machine scheduler. This will be used as the 2829 /// default scheduler if the target does not set a default. 2830 static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C) { 2831 ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new GenericScheduler(C)); 2832 // Register DAG post-processors. 
2833 // 2834 // FIXME: extend the mutation API to allow earlier mutations to instantiate 2835 // data and pass it to later mutations. Have a single mutation that gathers 2836 // the interesting nodes in one pass. 2837 DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI)); 2838 if (EnableLoadCluster && DAG->TII->enableClusterLoads()) 2839 DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI)); 2840 if (EnableMacroFusion) 2841 DAG->addMutation(new MacroFusion(DAG->TII)); 2842 return DAG; 2843 } 2844 static MachineSchedRegistry 2845 GenericSchedRegistry("converge", "Standard converging scheduler.", 2846 createGenericSched); 2847 2848 //===----------------------------------------------------------------------===// 2849 // ILP Scheduler. Currently for experimental analysis of heuristics. 2850 //===----------------------------------------------------------------------===// 2851 2852 namespace { 2853 /// \brief Order nodes by the ILP metric. 2854 struct ILPOrder { 2855 const SchedDFSResult *DFSResult; 2856 const BitVector *ScheduledTrees; 2857 bool MaximizeILP; 2858 2859 ILPOrder(bool MaxILP): DFSResult(0), ScheduledTrees(0), MaximizeILP(MaxILP) {} 2860 2861 /// \brief Apply a less-than relation on node priority. 2862 /// 2863 /// (Return true if A comes after B in the Q.) 2864 bool operator()(const SUnit *A, const SUnit *B) const { 2865 unsigned SchedTreeA = DFSResult->getSubtreeID(A); 2866 unsigned SchedTreeB = DFSResult->getSubtreeID(B); 2867 if (SchedTreeA != SchedTreeB) { 2868 // Unscheduled trees have lower priority. 2869 if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB)) 2870 return ScheduledTrees->test(SchedTreeB); 2871 2872 // Trees with shallower connections have lower priority. 2873 if (DFSResult->getSubtreeLevel(SchedTreeA) 2874 != DFSResult->getSubtreeLevel(SchedTreeB)) { 2875 return DFSResult->getSubtreeLevel(SchedTreeA) 2876 < DFSResult->getSubtreeLevel(SchedTreeB); 2877 } 2878 } 2879 if (MaximizeILP) 2880 return DFSResult->getILP(A) < DFSResult->getILP(B); 2881 else 2882 return DFSResult->getILP(A) > DFSResult->getILP(B); 2883 } 2884 }; 2885 2886 /// \brief Schedule based on the ILP metric. 2887 class ILPScheduler : public MachineSchedStrategy { 2888 ScheduleDAGMI *DAG; 2889 ILPOrder Cmp; 2890 2891 std::vector<SUnit*> ReadyQ; 2892 public: 2893 ILPScheduler(bool MaximizeILP): DAG(0), Cmp(MaximizeILP) {} 2894 2895 virtual void initialize(ScheduleDAGMI *dag) { 2896 DAG = dag; 2897 DAG->computeDFSResult(); 2898 Cmp.DFSResult = DAG->getDFSResult(); 2899 Cmp.ScheduledTrees = &DAG->getScheduledTrees(); 2900 ReadyQ.clear(); 2901 } 2902 2903 virtual void registerRoots() { 2904 // Restore the heap in ReadyQ with the updated DFS results. 2905 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2906 } 2907 2908 /// Implement MachineSchedStrategy interface. 2909 /// ----------------------------------------- 2910 2911 /// Callback to select the highest priority node from the ready Q.
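/// The ready Q is kept as a heap ordered by ILPOrder, and pickNode() pops its
/// top element. This strategy only schedules bottom-up, so IsTopNode is
/// always set to false.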
2912 virtual SUnit *pickNode(bool &IsTopNode) { 2913 if (ReadyQ.empty()) return NULL; 2914 std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2915 SUnit *SU = ReadyQ.back(); 2916 ReadyQ.pop_back(); 2917 IsTopNode = false; 2918 DEBUG(dbgs() << "Pick node " << "SU(" << SU->NodeNum << ") " 2919 << " ILP: " << DAG->getDFSResult()->getILP(SU) 2920 << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @" 2921 << DAG->getDFSResult()->getSubtreeLevel( 2922 DAG->getDFSResult()->getSubtreeID(SU)) << '\n' 2923 << "Scheduling " << *SU->getInstr()); 2924 return SU; 2925 } 2926 2927 /// \brief Scheduler callback to notify that a new subtree is scheduled. 2928 virtual void scheduleTree(unsigned SubtreeID) { 2929 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2930 } 2931 2932 /// Callback after a node is scheduled. Mark a newly scheduled tree, notify 2933 /// DFSResults, and resort the priority Q. 2934 virtual void schedNode(SUnit *SU, bool IsTopNode) { 2935 assert(!IsTopNode && "SchedDFSResult needs bottom-up"); 2936 } 2937 2938 virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ } 2939 2940 virtual void releaseBottomNode(SUnit *SU) { 2941 ReadyQ.push_back(SU); 2942 std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 2943 } 2944 }; 2945 } // namespace 2946 2947 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { 2948 return new ScheduleDAGMI(C, new ILPScheduler(true)); 2949 } 2950 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { 2951 return new ScheduleDAGMI(C, new ILPScheduler(false)); 2952 } 2953 static MachineSchedRegistry ILPMaxRegistry( 2954 "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler); 2955 static MachineSchedRegistry ILPMinRegistry( 2956 "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler); 2957 2958 //===----------------------------------------------------------------------===// 2959 // Machine Instruction Shuffler for Correctness Testing 2960 //===----------------------------------------------------------------------===// 2961 2962 #ifndef NDEBUG 2963 namespace { 2964 /// Apply a less-than relation on the node order, which corresponds to the 2965 /// instruction order prior to scheduling. IsReverse implements greater-than. 2966 template<bool IsReverse> 2967 struct SUnitOrder { 2968 bool operator()(SUnit *A, SUnit *B) const { 2969 if (IsReverse) 2970 return A->NodeNum > B->NodeNum; 2971 else 2972 return A->NodeNum < B->NodeNum; 2973 } 2974 }; 2975 2976 /// Reorder instructions as much as possible. 2977 class InstructionShuffler : public MachineSchedStrategy { 2978 bool IsAlternating; 2979 bool IsTopDown; 2980 2981 // Using a less-than relation (SUnitOrder<false>) for the TopQ priority 2982 // gives nodes with a higher number higher priority causing the latest 2983 // instructions to be scheduled first. 2984 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> > 2985 TopQ; 2986 // When scheduling bottom-up, use greater-than as the queue priority. 2987 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> > 2988 BottomQ; 2989 public: 2990 InstructionShuffler(bool alternate, bool topdown) 2991 : IsAlternating(alternate), IsTopDown(topdown) {} 2992 2993 virtual void initialize(ScheduleDAGMI *) { 2994 TopQ.clear(); 2995 BottomQ.clear(); 2996 } 2997 2998 /// Implement MachineSchedStrategy interface. 
2999 /// ----------------------------------------- 3000 3001 virtual SUnit *pickNode(bool &IsTopNode) { 3002 SUnit *SU; 3003 if (IsTopDown) { 3004 do { 3005 if (TopQ.empty()) return NULL; 3006 SU = TopQ.top(); 3007 TopQ.pop(); 3008 } while (SU->isScheduled); 3009 IsTopNode = true; 3010 } 3011 else { 3012 do { 3013 if (BottomQ.empty()) return NULL; 3014 SU = BottomQ.top(); 3015 BottomQ.pop(); 3016 } while (SU->isScheduled); 3017 IsTopNode = false; 3018 } 3019 if (IsAlternating) 3020 IsTopDown = !IsTopDown; 3021 return SU; 3022 } 3023 3024 virtual void schedNode(SUnit *SU, bool IsTopNode) {} 3025 3026 virtual void releaseTopNode(SUnit *SU) { 3027 TopQ.push(SU); 3028 } 3029 virtual void releaseBottomNode(SUnit *SU) { 3030 BottomQ.push(SU); 3031 } 3032 }; 3033 } // namespace 3034 3035 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { 3036 bool Alternate = !ForceTopDown && !ForceBottomUp; 3037 bool TopDown = !ForceBottomUp; 3038 assert((TopDown || !ForceTopDown) && 3039 "-misched-topdown incompatible with -misched-bottomup"); 3040 return new ScheduleDAGMI(C, new InstructionShuffler(Alternate, TopDown)); 3041 } 3042 static MachineSchedRegistry ShufflerRegistry( 3043 "shuffle", "Shuffle machine instructions alternating directions", 3044 createInstructionShuffler); 3045 #endif // !NDEBUG 3046 3047 //===----------------------------------------------------------------------===// 3048 // GraphWriter support for ScheduleDAGMI. 3049 //===----------------------------------------------------------------------===// 3050 3051 #ifndef NDEBUG 3052 namespace llvm { 3053 3054 template<> struct GraphTraits< 3055 ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {}; 3056 3057 template<> 3058 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { 3059 3060 DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} 3061 3062 static std::string getGraphName(const ScheduleDAG *G) { 3063 return G->MF.getName(); 3064 } 3065 3066 static bool renderGraphFromBottomUp() { 3067 return true; 3068 } 3069 3070 static bool isNodeHidden(const SUnit *Node) { 3071 return (Node->Preds.size() > 10 || Node->Succs.size() > 10); 3072 } 3073 3074 static bool hasNodeAddressLabel(const SUnit *Node, 3075 const ScheduleDAG *Graph) { 3076 return false; 3077 } 3078 3079 /// If you want to override the dot attributes printed for a particular 3080 /// edge, override this method. 
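/// For example, the implementation below renders artificial edges dashed cyan
/// and control (non-data) edges dashed blue; returning an empty string keeps
/// the default edge style.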
3081 static std::string getEdgeAttributes(const SUnit *Node, 3082 SUnitIterator EI, 3083 const ScheduleDAG *Graph) { 3084 if (EI.isArtificialDep()) 3085 return "color=cyan,style=dashed"; 3086 if (EI.isCtrlDep()) 3087 return "color=blue,style=dashed"; 3088 return ""; 3089 } 3090 3091 static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) { 3092 std::string Str; 3093 raw_string_ostream SS(Str); 3094 const SchedDFSResult *DFS = 3095 static_cast<const ScheduleDAGMI*>(G)->getDFSResult(); 3096 SS << "SU:" << SU->NodeNum; 3097 if (DFS) 3098 SS << " I:" << DFS->getNumInstrs(SU); 3099 return SS.str(); 3100 } 3101 static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) { 3102 return G->getGraphNodeLabel(SU); 3103 } 3104 3105 static std::string getNodeAttributes(const SUnit *N, 3106 const ScheduleDAG *Graph) { 3107 std::string Str("shape=Mrecord"); 3108 const SchedDFSResult *DFS = 3109 static_cast<const ScheduleDAGMI*>(Graph)->getDFSResult(); 3110 if (DFS) { 3111 Str += ",style=filled,fillcolor=\"#"; 3112 Str += DOT::getColorString(DFS->getSubtreeID(N)); 3113 Str += '"'; 3114 } 3115 return Str; 3116 } 3117 }; 3118 } // namespace llvm 3119 #endif // NDEBUG 3120 3121 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG 3122 /// rendered using 'dot'. 3123 /// 3124 void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) { 3125 #ifndef NDEBUG 3126 ViewGraph(this, Name, false, Title); 3127 #else 3128 errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on " 3129 << "systems with Graphviz or gv!\n"; 3130 #endif // NDEBUG 3131 } 3132 3133 /// Out-of-line implementation with no arguments is handy for gdb. 3134 void ScheduleDAGMI::viewGraph() { 3135 viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName()); 3136 } 3137