1 //===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // MachineScheduler schedules machine instructions after phi elimination. It 10 // preserves LiveIntervals so it can be invoked before register allocation. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/CodeGen/MachineScheduler.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/BitVector.h" 17 #include "llvm/ADT/DenseMap.h" 18 #include "llvm/ADT/PriorityQueue.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/ADT/iterator_range.h" 23 #include "llvm/Analysis/AliasAnalysis.h" 24 #include "llvm/CodeGen/LiveInterval.h" 25 #include "llvm/CodeGen/LiveIntervals.h" 26 #include "llvm/CodeGen/MachineBasicBlock.h" 27 #include "llvm/CodeGen/MachineDominators.h" 28 #include "llvm/CodeGen/MachineFunction.h" 29 #include "llvm/CodeGen/MachineFunctionPass.h" 30 #include "llvm/CodeGen/MachineInstr.h" 31 #include "llvm/CodeGen/MachineLoopInfo.h" 32 #include "llvm/CodeGen/MachineOperand.h" 33 #include "llvm/CodeGen/MachinePassRegistry.h" 34 #include "llvm/CodeGen/MachineRegisterInfo.h" 35 #include "llvm/CodeGen/RegisterClassInfo.h" 36 #include "llvm/CodeGen/RegisterPressure.h" 37 #include "llvm/CodeGen/ScheduleDAG.h" 38 #include "llvm/CodeGen/ScheduleDAGInstrs.h" 39 #include "llvm/CodeGen/ScheduleDAGMutation.h" 40 #include "llvm/CodeGen/ScheduleDFS.h" 41 #include "llvm/CodeGen/ScheduleHazardRecognizer.h" 42 #include "llvm/CodeGen/SlotIndexes.h" 43 #include "llvm/CodeGen/TargetFrameLowering.h" 44 #include "llvm/CodeGen/TargetInstrInfo.h" 45 #include "llvm/CodeGen/TargetLowering.h" 46 #include "llvm/CodeGen/TargetPassConfig.h" 47 #include "llvm/CodeGen/TargetRegisterInfo.h" 48 #include "llvm/CodeGen/TargetSchedule.h" 49 #include "llvm/CodeGen/TargetSubtargetInfo.h" 50 #include "llvm/CodeGenTypes/MachineValueType.h" 51 #include "llvm/Config/llvm-config.h" 52 #include "llvm/InitializePasses.h" 53 #include "llvm/MC/LaneBitmask.h" 54 #include "llvm/Pass.h" 55 #include "llvm/Support/CommandLine.h" 56 #include "llvm/Support/Compiler.h" 57 #include "llvm/Support/Debug.h" 58 #include "llvm/Support/ErrorHandling.h" 59 #include "llvm/Support/GraphWriter.h" 60 #include "llvm/Support/raw_ostream.h" 61 #include <algorithm> 62 #include <cassert> 63 #include <cstdint> 64 #include <iterator> 65 #include <limits> 66 #include <memory> 67 #include <string> 68 #include <tuple> 69 #include <utility> 70 #include <vector> 71 72 using namespace llvm; 73 74 #define DEBUG_TYPE "machine-scheduler" 75 76 STATISTIC(NumClustered, "Number of load/store pairs clustered"); 77 78 namespace llvm { 79 80 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden, 81 cl::desc("Force top-down list scheduling")); 82 cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, 83 cl::desc("Force bottom-up list scheduling")); 84 namespace MISchedPostRASched { 85 enum Direction { 86 TopDown, 87 BottomUp, 88 }; 89 } // end namespace MISchedPostRASched 90 cl::opt<MISchedPostRASched::Direction> PostRADirection( 91 "misched-postra-direction", cl::Hidden, 92 cl::desc("Post reg-alloc list scheduling direction"), 93 // Default to 
top-down because it was implemented first and existing targets 94 // expect that behavior by default. 95 cl::init(MISchedPostRASched::TopDown), 96 cl::values(clEnumValN(MISchedPostRASched::TopDown, "topdown", 97 "Force top-down post reg-alloc list scheduling"), 98 clEnumValN(MISchedPostRASched::BottomUp, "bottomup", 99 "Force bottom-up post reg-alloc list scheduling"))); 100 cl::opt<bool> 101 DumpCriticalPathLength("misched-dcpl", cl::Hidden, 102 cl::desc("Print critical path length to stdout")); 103 104 cl::opt<bool> VerifyScheduling( 105 "verify-misched", cl::Hidden, 106 cl::desc("Verify machine instrs before and after machine scheduling")); 107 108 #ifndef NDEBUG 109 cl::opt<bool> ViewMISchedDAGs( 110 "view-misched-dags", cl::Hidden, 111 cl::desc("Pop up a window to show MISched dags after they are processed")); 112 cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden, 113 cl::desc("Print schedule DAGs")); 114 cl::opt<bool> MISchedDumpReservedCycles( 115 "misched-dump-reserved-cycles", cl::Hidden, cl::init(false), 116 cl::desc("Dump resource usage at schedule boundary.")); 117 cl::opt<bool> MischedDetailResourceBooking( 118 "misched-detail-resource-booking", cl::Hidden, cl::init(false), 119 cl::desc("Show details of invoking getNextResourceCycle.")); 120 #else 121 const bool ViewMISchedDAGs = false; 122 const bool PrintDAGs = false; 123 const bool MischedDetailResourceBooking = false; 124 #ifdef LLVM_ENABLE_DUMP 125 const bool MISchedDumpReservedCycles = false; 126 #endif // LLVM_ENABLE_DUMP 127 #endif // NDEBUG 128 129 } // end namespace llvm 130 131 #ifndef NDEBUG 132 /// In some situations a few uninteresting nodes depend on nearly all other 133 /// nodes in the graph; provide a cutoff to hide them. 134 static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden, 135 cl::desc("Hide nodes with more predecessors/successors than cutoff")); 136 137 static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden, 138 cl::desc("Stop scheduling after N instructions"), cl::init(~0U)); 139 140 static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden, 141 cl::desc("Only schedule this function")); 142 static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden, 143 cl::desc("Only schedule this MBB#")); 144 #endif // NDEBUG 145 146 /// Avoid quadratic complexity in unusually large basic blocks by limiting the 147 /// size of the ready lists. 
148 static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden, 149 cl::desc("Limit ready list to N instructions"), cl::init(256)); 150 151 static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden, 152 cl::desc("Enable register pressure scheduling."), cl::init(true)); 153 154 static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden, 155 cl::desc("Enable cyclic critical path analysis."), cl::init(true)); 156 157 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden, 158 cl::desc("Enable memop clustering."), 159 cl::init(true)); 160 static cl::opt<bool> 161 ForceFastCluster("force-fast-cluster", cl::Hidden, 162 cl::desc("Switch to fast cluster algorithm with the loss " 163 "of some fusion opportunities"), 164 cl::init(false)); 165 static cl::opt<unsigned> 166 FastClusterThreshold("fast-cluster-threshold", cl::Hidden, 167 cl::desc("The threshold for fast cluster"), 168 cl::init(1000)); 169 170 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 171 static cl::opt<bool> MISchedDumpScheduleTrace( 172 "misched-dump-schedule-trace", cl::Hidden, cl::init(false), 173 cl::desc("Dump resource usage at schedule boundary.")); 174 static cl::opt<unsigned> 175 HeaderColWidth("misched-dump-schedule-trace-col-header-width", cl::Hidden, 176 cl::desc("Set width of the columns with " 177 "the resources and schedule units"), 178 cl::init(19)); 179 static cl::opt<unsigned> 180 ColWidth("misched-dump-schedule-trace-col-width", cl::Hidden, 181 cl::desc("Set width of the columns showing resource booking."), 182 cl::init(5)); 183 static cl::opt<bool> MISchedSortResourcesInTrace( 184 "misched-sort-resources-in-trace", cl::Hidden, cl::init(true), 185 cl::desc("Sort the resources printed in the dump trace")); 186 #endif 187 188 static cl::opt<unsigned> 189 MIResourceCutOff("misched-resource-cutoff", cl::Hidden, 190 cl::desc("Number of intervals to track"), cl::init(10)); 191 192 // DAG subtrees must have at least this many nodes. 193 static const unsigned MinSubtreeSize = 8; 194 195 // Pin the vtables to this file. 196 void MachineSchedStrategy::anchor() {} 197 198 void ScheduleDAGMutation::anchor() {} 199 200 //===----------------------------------------------------------------------===// 201 // Machine Instruction Scheduling Pass and Registry 202 //===----------------------------------------------------------------------===// 203 204 MachineSchedContext::MachineSchedContext() { 205 RegClassInfo = new RegisterClassInfo(); 206 } 207 208 MachineSchedContext::~MachineSchedContext() { 209 delete RegClassInfo; 210 } 211 212 namespace { 213 214 /// Base class for a machine scheduler that can run at any point. 215 class MachineSchedulerBase : public MachineSchedContext, 216 public MachineFunctionPass { 217 public: 218 MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {} 219 220 void print(raw_ostream &O, const Module* = nullptr) const override; 221 222 protected: 223 void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags); 224 }; 225 226 /// MachineScheduler runs after coalescing and before register allocation. 
227 class MachineScheduler : public MachineSchedulerBase { 228 public: 229 MachineScheduler(); 230 231 void getAnalysisUsage(AnalysisUsage &AU) const override; 232 233 bool runOnMachineFunction(MachineFunction&) override; 234 235 static char ID; // Class identification, replacement for typeinfo 236 237 protected: 238 ScheduleDAGInstrs *createMachineScheduler(); 239 }; 240 241 /// PostMachineScheduler runs after register allocation, shortly before code emission. 242 class PostMachineScheduler : public MachineSchedulerBase { 243 public: 244 PostMachineScheduler(); 245 246 void getAnalysisUsage(AnalysisUsage &AU) const override; 247 248 bool runOnMachineFunction(MachineFunction&) override; 249 250 static char ID; // Class identification, replacement for typeinfo 251 252 protected: 253 ScheduleDAGInstrs *createPostMachineScheduler(); 254 }; 255 256 } // end anonymous namespace 257 258 char MachineScheduler::ID = 0; 259 260 char &llvm::MachineSchedulerID = MachineScheduler::ID; 261 262 INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, 263 "Machine Instruction Scheduler", false, false) 264 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 265 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 266 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) 267 INITIALIZE_PASS_DEPENDENCY(SlotIndexes) 268 INITIALIZE_PASS_DEPENDENCY(LiveIntervals) 269 INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE, 270 "Machine Instruction Scheduler", false, false) 271 272 MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) { 273 initializeMachineSchedulerPass(*PassRegistry::getPassRegistry()); 274 } 275 276 void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { 277 AU.setPreservesCFG(); 278 AU.addRequired<MachineDominatorTree>(); 279 AU.addRequired<MachineLoopInfo>(); 280 AU.addRequired<AAResultsWrapperPass>(); 281 AU.addRequired<TargetPassConfig>(); 282 AU.addRequired<SlotIndexes>(); 283 AU.addPreserved<SlotIndexes>(); 284 AU.addRequired<LiveIntervals>(); 285 AU.addPreserved<LiveIntervals>(); 286 MachineFunctionPass::getAnalysisUsage(AU); 287 } 288 289 char PostMachineScheduler::ID = 0; 290 291 char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID; 292 293 INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched", 294 "PostRA Machine Instruction Scheduler", false, false) 295 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 296 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) 297 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 298 INITIALIZE_PASS_END(PostMachineScheduler, "postmisched", 299 "PostRA Machine Instruction Scheduler", false, false) 300 301 PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) { 302 initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry()); 303 } 304 305 void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { 306 AU.setPreservesCFG(); 307 AU.addRequired<MachineDominatorTree>(); 308 AU.addRequired<MachineLoopInfo>(); 309 AU.addRequired<AAResultsWrapperPass>(); 310 AU.addRequired<TargetPassConfig>(); 311 MachineFunctionPass::getAnalysisUsage(AU); 312 } 313 314 MachinePassRegistry<MachineSchedRegistry::ScheduleDAGCtor> 315 MachineSchedRegistry::Registry; 316 317 /// A dummy default scheduler factory indicates whether the scheduler 318 /// is overridden on the command line. 319 static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { 320 return nullptr; 321 } 322 323 /// MachineSchedOpt allows command line selection of the scheduler. 
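// As an illustrative sketch only (the names createMySched/MyStrategy below are
// hypothetical, not part of this file): a scheduling strategy is typically made
// selectable through -misched=<name> by registering a factory with
// MachineSchedRegistry, mirroring the DefaultSchedRegistry definition below:
//
//   static ScheduleDAGInstrs *createMySched(MachineSchedContext *C) {
//     return new ScheduleDAGMILive(C, std::make_unique<MyStrategy>(C));
//   }
//   static MachineSchedRegistry MySchedRegistry(
//       "my-sched", "Description of the custom scheduler.", createMySched);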
324 static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, 325 RegisterPassParser<MachineSchedRegistry>> 326 MachineSchedOpt("misched", 327 cl::init(&useDefaultMachineSched), cl::Hidden, 328 cl::desc("Machine instruction scheduler to use")); 329 330 static MachineSchedRegistry 331 DefaultSchedRegistry("default", "Use the target's default scheduler choice.", 332 useDefaultMachineSched); 333 334 static cl::opt<bool> EnableMachineSched( 335 "enable-misched", 336 cl::desc("Enable the machine instruction scheduling pass."), cl::init(true), 337 cl::Hidden); 338 339 static cl::opt<bool> EnablePostRAMachineSched( 340 "enable-post-misched", 341 cl::desc("Enable the post-ra machine instruction scheduling pass."), 342 cl::init(true), cl::Hidden); 343 344 /// Decrement this iterator until reaching the top or a non-debug instr. 345 static MachineBasicBlock::const_iterator 346 priorNonDebug(MachineBasicBlock::const_iterator I, 347 MachineBasicBlock::const_iterator Beg) { 348 assert(I != Beg && "reached the top of the region, cannot decrement"); 349 while (--I != Beg) { 350 if (!I->isDebugOrPseudoInstr()) 351 break; 352 } 353 return I; 354 } 355 356 /// Non-const version. 357 static MachineBasicBlock::iterator 358 priorNonDebug(MachineBasicBlock::iterator I, 359 MachineBasicBlock::const_iterator Beg) { 360 return priorNonDebug(MachineBasicBlock::const_iterator(I), Beg) 361 .getNonConstIterator(); 362 } 363 364 /// If this iterator is a debug value, increment until reaching the End or a 365 /// non-debug instruction. 366 static MachineBasicBlock::const_iterator 367 nextIfDebug(MachineBasicBlock::const_iterator I, 368 MachineBasicBlock::const_iterator End) { 369 for(; I != End; ++I) { 370 if (!I->isDebugOrPseudoInstr()) 371 break; 372 } 373 return I; 374 } 375 376 /// Non-const version. 377 static MachineBasicBlock::iterator 378 nextIfDebug(MachineBasicBlock::iterator I, 379 MachineBasicBlock::const_iterator End) { 380 return nextIfDebug(MachineBasicBlock::const_iterator(I), End) 381 .getNonConstIterator(); 382 } 383 384 /// Instantiate a ScheduleDAGInstrs that will be owned by the caller. 385 ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() { 386 // Select the scheduler, or set the default. 387 MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt; 388 if (Ctor != useDefaultMachineSched) 389 return Ctor(this); 390 391 // Get the default scheduler set by the target for this function. 392 ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this); 393 if (Scheduler) 394 return Scheduler; 395 396 // Default to GenericScheduler. 397 return createGenericSchedLive(this); 398 } 399 400 /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by 401 /// the caller. We don't have a command line option to override the postRA 402 /// scheduler. The Target must configure it. 403 ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() { 404 // Get the postRA scheduler set by the target for this function. 405 ScheduleDAGInstrs *Scheduler = PassConfig->createPostMachineScheduler(this); 406 if (Scheduler) 407 return Scheduler; 408 409 // Default to GenericScheduler. 410 return createGenericSchedPostRA(this); 411 } 412 413 /// Top-level MachineScheduler pass driver. 414 /// 415 /// Visit blocks in function order. Divide each block into scheduling regions 416 /// and visit them bottom-up. Visiting regions bottom-up is not required, but is 417 /// consistent with the DAG builder, which traverses the interior of the 418 /// scheduling regions bottom-up. 
419 /// 420 /// This design avoids exposing scheduling boundaries to the DAG builder, 421 /// simplifying the DAG builder's support for "special" target instructions. 422 /// At the same time the design allows target schedulers to operate across 423 /// scheduling boundaries, for example to bundle the boundary instructions 424 /// without reordering them. This creates complexity, because the target 425 /// scheduler must update the RegionBegin and RegionEnd positions cached by 426 /// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler 427 /// design would be to split blocks at scheduling boundaries, but LLVM has a 428 /// general bias against block splitting purely for implementation simplicity. 429 bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { 430 if (skipFunction(mf.getFunction())) 431 return false; 432 433 if (EnableMachineSched.getNumOccurrences()) { 434 if (!EnableMachineSched) 435 return false; 436 } else if (!mf.getSubtarget().enableMachineScheduler()) 437 return false; 438 439 LLVM_DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs())); 440 441 // Initialize the context of the pass. 442 MF = &mf; 443 MLI = &getAnalysis<MachineLoopInfo>(); 444 MDT = &getAnalysis<MachineDominatorTree>(); 445 PassConfig = &getAnalysis<TargetPassConfig>(); 446 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 447 448 LIS = &getAnalysis<LiveIntervals>(); 449 450 if (VerifyScheduling) { 451 LLVM_DEBUG(LIS->dump()); 452 MF->verify(this, "Before machine scheduling."); 453 } 454 RegClassInfo->runOnMachineFunction(*MF); 455 456 // Instantiate the selected scheduler for this target, function, and 457 // optimization level. 458 std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler()); 459 ScheduleDAGMI::DumpDirection D; 460 if (ForceTopDown) 461 D = ScheduleDAGMI::DumpDirection::TopDown; 462 else if (ForceBottomUp) 463 D = ScheduleDAGMI::DumpDirection::BottomUp; 464 else 465 D = ScheduleDAGMI::DumpDirection::Bidirectional; 466 Scheduler->setDumpDirection(D); 467 scheduleRegions(*Scheduler, false); 468 469 LLVM_DEBUG(LIS->dump()); 470 if (VerifyScheduling) 471 MF->verify(this, "After machine scheduling."); 472 return true; 473 } 474 475 bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { 476 if (skipFunction(mf.getFunction())) 477 return false; 478 479 if (EnablePostRAMachineSched.getNumOccurrences()) { 480 if (!EnablePostRAMachineSched) 481 return false; 482 } else if (!mf.getSubtarget().enablePostRAMachineScheduler()) { 483 LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n"); 484 return false; 485 } 486 LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs())); 487 488 // Initialize the context of the pass. 489 MF = &mf; 490 MLI = &getAnalysis<MachineLoopInfo>(); 491 PassConfig = &getAnalysis<TargetPassConfig>(); 492 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 493 494 if (VerifyScheduling) 495 MF->verify(this, "Before post machine scheduling."); 496 497 // Instantiate the selected scheduler for this target, function, and 498 // optimization level. 
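// There is no command-line override for the post-RA scheduler (see
// createPostMachineScheduler above); the target is expected to supply one via
// its TargetPassConfig hook. A rough sketch of such a hook, using a
// hypothetical target class name, might look like:
//
//   ScheduleDAGInstrs *
//   MyTargetPassConfig::createPostMachineScheduler(MachineSchedContext *C) const {
//     return new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(C),
//                              /*RemoveKillFlags=*/true);
//   }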
499 std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler()); 500 ScheduleDAGMI::DumpDirection D; 501 if (PostRADirection == MISchedPostRASched::TopDown) 502 D = ScheduleDAGMI::DumpDirection::TopDown; 503 else 504 D = ScheduleDAGMI::DumpDirection::BottomUp; 505 Scheduler->setDumpDirection(D); 506 scheduleRegions(*Scheduler, true); 507 508 if (VerifyScheduling) 509 MF->verify(this, "After post machine scheduling."); 510 return true; 511 } 512 513 /// Return true if the given instruction should not be included in a scheduling 514 /// region. 515 /// 516 /// MachineScheduler does not currently support scheduling across calls. To 517 /// handle calls, the DAG builder needs to be modified to create register 518 /// anti/output dependencies on the registers clobbered by the call's regmask 519 /// operand. In PreRA scheduling, the stack pointer adjustment already prevents 520 /// scheduling across calls. In PostRA scheduling, we need the isCall check to 521 /// enforce the boundary, but there would be no benefit to postRA scheduling 522 /// across calls this late anyway. 523 static bool isSchedBoundary(MachineBasicBlock::iterator MI, 524 MachineBasicBlock *MBB, 525 MachineFunction *MF, 526 const TargetInstrInfo *TII) { 527 return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF); 528 } 529 530 /// A region of an MBB for scheduling. 531 namespace { 532 struct SchedRegion { 533 /// RegionBegin is the first instruction in the scheduling region, and 534 /// RegionEnd is either MBB->end() or the scheduling boundary after the 535 /// last instruction in the scheduling region. These iterators cannot refer 536 /// to instructions outside of the identified scheduling region because 537 /// those may be reordered before scheduling this region. 538 MachineBasicBlock::iterator RegionBegin; 539 MachineBasicBlock::iterator RegionEnd; 540 unsigned NumRegionInstrs; 541 542 SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E, 543 unsigned N) : 544 RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {} 545 }; 546 } // end anonymous namespace 547 548 using MBBRegionsVector = SmallVector<SchedRegion, 16>; 549 550 static void 551 getSchedRegions(MachineBasicBlock *MBB, 552 MBBRegionsVector &Regions, 553 bool RegionsTopDown) { 554 MachineFunction *MF = MBB->getParent(); 555 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 556 557 MachineBasicBlock::iterator I = nullptr; 558 for(MachineBasicBlock::iterator RegionEnd = MBB->end(); 559 RegionEnd != MBB->begin(); RegionEnd = I) { 560 561 // Avoid decrementing RegionEnd for blocks with no terminator. 562 if (RegionEnd != MBB->end() || 563 isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) { 564 --RegionEnd; 565 } 566 567 // The next region starts above the previous region. Look backward in the 568 // instruction stream until we find the nearest boundary. 569 unsigned NumRegionInstrs = 0; 570 I = RegionEnd; 571 for (;I != MBB->begin(); --I) { 572 MachineInstr &MI = *std::prev(I); 573 if (isSchedBoundary(&MI, &*MBB, MF, TII)) 574 break; 575 if (!MI.isDebugOrPseudoInstr()) { 576 // MBB::size() uses instr_iterator to count. Here we need a bundle to 577 // count as a single instruction. 578 ++NumRegionInstrs; 579 } 580 } 581 582 // It's possible we found a scheduling region that only has debug 583 // instructions. Don't bother scheduling these. 
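// As a small worked illustration (pseudo-instructions, not from any real
// target) of what the loop above produces for a block
//
//   %1 = ...
//   %2 = ...
//   CALL @f        <- isSchedBoundary() because of isCall()
//   %3 = ...
//   %4 = ...
//
// two regions are recorded, bottom-up: [%3, end) and [%1, CALL). The CALL is
// excluded from both DAGs; it only serves as the RegionEnd of the upper
// region. A region containing nothing but debug instructions is dropped by
// the check below.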
584 if (NumRegionInstrs != 0) 585 Regions.push_back(SchedRegion(I, RegionEnd, NumRegionInstrs)); 586 } 587 588 if (RegionsTopDown) 589 std::reverse(Regions.begin(), Regions.end()); 590 } 591 592 /// Main driver for both MachineScheduler and PostMachineScheduler. 593 void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, 594 bool FixKillFlags) { 595 // Visit all machine basic blocks. 596 // 597 // TODO: Visit blocks in global postorder or postorder within the bottom-up 598 // loop tree. Then we can optionally compute global RegPressure. 599 for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end(); 600 MBB != MBBEnd; ++MBB) { 601 602 Scheduler.startBlock(&*MBB); 603 604 #ifndef NDEBUG 605 if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName()) 606 continue; 607 if (SchedOnlyBlock.getNumOccurrences() 608 && (int)SchedOnlyBlock != MBB->getNumber()) 609 continue; 610 #endif 611 612 // Break the block into scheduling regions [I, RegionEnd). RegionEnd 613 // points to the scheduling boundary at the bottom of the region. The DAG 614 // does not include RegionEnd, but the region does (i.e. the next 615 // RegionEnd is above the previous RegionBegin). If the current block has 616 // no terminator then RegionEnd == MBB->end() for the bottom region. 617 // 618 // All the regions of MBB are first found and stored in MBBRegions. They are 619 // then processed (within MBB) top-down if Scheduler.doMBBSchedRegionsTopDown() 620 // returns true. 621 // 622 // The Scheduler may insert instructions during either schedule() or 623 // exitRegion(), even for empty regions. So the local iterators 'I' and 624 // 'RegionEnd' are invalid across these calls. Instructions must not be 625 // added to regions other than the current one without updating MBBRegions. 626 MBBRegionsVector MBBRegions; 627 getSchedRegions(&*MBB, MBBRegions, Scheduler.doMBBSchedRegionsTopDown()); 628 for (const SchedRegion &R : MBBRegions) { 629 MachineBasicBlock::iterator I = R.RegionBegin; 630 MachineBasicBlock::iterator RegionEnd = R.RegionEnd; 631 unsigned NumRegionInstrs = R.NumRegionInstrs; 632 633 // Notify the scheduler of the region, even if we may skip scheduling 634 // it. Perhaps it still needs to be bundled. 635 Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs); 636 637 // Skip empty scheduling regions (0 or 1 schedulable instructions). 638 if (I == RegionEnd || I == std::prev(RegionEnd)) { 639 // Close the current region. Bundle the terminator if needed. 640 // This invalidates 'RegionEnd' and 'I'. 641 Scheduler.exitRegion(); 642 continue; 643 } 644 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); 645 LLVM_DEBUG(dbgs() << MF->getName() << ":" << printMBBReference(*MBB) 646 << " " << MBB->getName() << "\n From: " << *I 647 << " To: "; 648 if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; 649 else dbgs() << "End\n"; 650 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); 651 if (DumpCriticalPathLength) { 652 errs() << MF->getName(); 653 errs() << ":%bb. " << MBB->getNumber(); 654 errs() << " " << MBB->getName() << " \n"; 655 } 656 657 // Schedule a region: possibly reorder instructions. 658 // This invalidates the original region iterators. 659 Scheduler.schedule(); 660 661 // Close the current region. 662 Scheduler.exitRegion(); 663 } 664 Scheduler.finishBlock(); 665 // FIXME: Ideally, no further passes should rely on kill flags. However, 666 // thumb2 size reduction is currently an exception, so the PostMIScheduler 667 // needs to do this. 
668 if (FixKillFlags) 669 Scheduler.fixupKills(*MBB); 670 } 671 Scheduler.finalizeSchedule(); 672 } 673 674 void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const { 675 // unimplemented 676 } 677 678 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 679 LLVM_DUMP_METHOD void ReadyQueue::dump() const { 680 dbgs() << "Queue " << Name << ": "; 681 for (const SUnit *SU : Queue) 682 dbgs() << SU->NodeNum << " "; 683 dbgs() << "\n"; 684 } 685 #endif 686 687 //===----------------------------------------------------------------------===// 688 // ScheduleDAGMI - Basic machine instruction scheduling. This is 689 // independent of PreRA/PostRA scheduling and involves no extra book-keeping for 690 // virtual registers. 691 // ===----------------------------------------------------------------------===/ 692 693 // Provide a vtable anchor. 694 ScheduleDAGMI::~ScheduleDAGMI() = default; 695 696 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When 697 /// NumPredsLeft reaches zero, release the successor node. 698 /// 699 /// FIXME: Adjust SuccSU height based on MinLatency. 700 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { 701 SUnit *SuccSU = SuccEdge->getSUnit(); 702 703 if (SuccEdge->isWeak()) { 704 --SuccSU->WeakPredsLeft; 705 if (SuccEdge->isCluster()) 706 NextClusterSucc = SuccSU; 707 return; 708 } 709 #ifndef NDEBUG 710 if (SuccSU->NumPredsLeft == 0) { 711 dbgs() << "*** Scheduling failed! ***\n"; 712 dumpNode(*SuccSU); 713 dbgs() << " has been released too many times!\n"; 714 llvm_unreachable(nullptr); 715 } 716 #endif 717 // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However, 718 // CurrCycle may have advanced since then. 719 if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency()) 720 SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency(); 721 722 --SuccSU->NumPredsLeft; 723 if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) 724 SchedImpl->releaseTopNode(SuccSU); 725 } 726 727 /// releaseSuccessors - Call releaseSucc on each of SU's successors. 728 void ScheduleDAGMI::releaseSuccessors(SUnit *SU) { 729 for (SDep &Succ : SU->Succs) 730 releaseSucc(SU, &Succ); 731 } 732 733 /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When 734 /// NumSuccsLeft reaches zero, release the predecessor node. 735 /// 736 /// FIXME: Adjust PredSU height based on MinLatency. 737 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { 738 SUnit *PredSU = PredEdge->getSUnit(); 739 740 if (PredEdge->isWeak()) { 741 --PredSU->WeakSuccsLeft; 742 if (PredEdge->isCluster()) 743 NextClusterPred = PredSU; 744 return; 745 } 746 #ifndef NDEBUG 747 if (PredSU->NumSuccsLeft == 0) { 748 dbgs() << "*** Scheduling failed! ***\n"; 749 dumpNode(*PredSU); 750 dbgs() << " has been released too many times!\n"; 751 llvm_unreachable(nullptr); 752 } 753 #endif 754 // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However, 755 // CurrCycle may have advanced since then. 756 if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency()) 757 PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency(); 758 759 --PredSU->NumSuccsLeft; 760 if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) 761 SchedImpl->releaseBottomNode(PredSU); 762 } 763 764 /// releasePredecessors - Call releasePred on each of SU's predecessors. 
765 void ScheduleDAGMI::releasePredecessors(SUnit *SU) { 766 for (SDep &Pred : SU->Preds) 767 releasePred(SU, &Pred); 768 } 769 770 void ScheduleDAGMI::startBlock(MachineBasicBlock *bb) { 771 ScheduleDAGInstrs::startBlock(bb); 772 SchedImpl->enterMBB(bb); 773 } 774 775 void ScheduleDAGMI::finishBlock() { 776 SchedImpl->leaveMBB(); 777 ScheduleDAGInstrs::finishBlock(); 778 } 779 780 /// enterRegion - Called back from PostMachineScheduler::runOnMachineFunction 781 /// after crossing a scheduling boundary. [begin, end) includes all instructions 782 /// in the region, including the boundary itself and single-instruction regions 783 /// that don't get scheduled. 784 void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb, 785 MachineBasicBlock::iterator begin, 786 MachineBasicBlock::iterator end, 787 unsigned regioninstrs) 788 { 789 ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs); 790 791 SchedImpl->initPolicy(begin, end, regioninstrs); 792 } 793 794 /// This is normally called from the main scheduler loop but may also be invoked 795 /// by the scheduling strategy to perform additional code motion. 796 void ScheduleDAGMI::moveInstruction( 797 MachineInstr *MI, MachineBasicBlock::iterator InsertPos) { 798 // Advance RegionBegin if the first instruction moves down. 799 if (&*RegionBegin == MI) 800 ++RegionBegin; 801 802 // Update the instruction stream. 803 BB->splice(InsertPos, BB, MI); 804 805 // Update LiveIntervals 806 if (LIS) 807 LIS->handleMove(*MI, /*UpdateFlags=*/true); 808 809 // Recede RegionBegin if an instruction moves above the first. 810 if (RegionBegin == InsertPos) 811 RegionBegin = MI; 812 } 813 814 bool ScheduleDAGMI::checkSchedLimit() { 815 #if LLVM_ENABLE_ABI_BREAKING_CHECKS && !defined(NDEBUG) 816 if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) { 817 CurrentTop = CurrentBottom; 818 return false; 819 } 820 ++NumInstrsScheduled; 821 #endif 822 return true; 823 } 824 825 /// Per-region scheduling driver, called back from 826 /// PostMachineScheduler::runOnMachineFunction. This is a simplified driver 827 /// that does not consider liveness or register pressure. It is useful for 828 /// PostRA scheduling and potentially other custom schedulers. 829 void ScheduleDAGMI::schedule() { 830 LLVM_DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n"); 831 LLVM_DEBUG(SchedImpl->dumpPolicy()); 832 833 // Build the DAG. 834 buildSchedGraph(AA); 835 836 postProcessDAG(); 837 838 SmallVector<SUnit*, 8> TopRoots, BotRoots; 839 findRootsAndBiasEdges(TopRoots, BotRoots); 840 841 LLVM_DEBUG(dump()); 842 if (PrintDAGs) dump(); 843 if (ViewMISchedDAGs) viewGraph(); 844 845 // Initialize the strategy before modifying the DAG. 846 // This may initialize a DFSResult to be used for queue priority. 847 SchedImpl->initialize(this); 848 849 // Initialize ready queues now that the DAG and priority data are finalized. 
850 initQueues(TopRoots, BotRoots); 851 852 bool IsTopNode = false; 853 while (true) { 854 LLVM_DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n"); 855 SUnit *SU = SchedImpl->pickNode(IsTopNode); 856 if (!SU) break; 857 858 assert(!SU->isScheduled && "Node already scheduled"); 859 if (!checkSchedLimit()) 860 break; 861 862 MachineInstr *MI = SU->getInstr(); 863 if (IsTopNode) { 864 assert(SU->isTopReady() && "node still has unscheduled dependencies"); 865 if (&*CurrentTop == MI) 866 CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom); 867 else 868 moveInstruction(MI, CurrentTop); 869 } else { 870 assert(SU->isBottomReady() && "node still has unscheduled dependencies"); 871 MachineBasicBlock::iterator priorII = 872 priorNonDebug(CurrentBottom, CurrentTop); 873 if (&*priorII == MI) 874 CurrentBottom = priorII; 875 else { 876 if (&*CurrentTop == MI) 877 CurrentTop = nextIfDebug(++CurrentTop, priorII); 878 moveInstruction(MI, CurrentBottom); 879 CurrentBottom = MI; 880 } 881 } 882 // Notify the scheduling strategy before updating the DAG. 883 // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues 884 // runs, it can then use the accurate ReadyCycle time to determine whether 885 // newly released nodes can move to the readyQ. 886 SchedImpl->schedNode(SU, IsTopNode); 887 888 updateQueues(SU, IsTopNode); 889 } 890 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); 891 892 placeDebugValues(); 893 894 LLVM_DEBUG({ 895 dbgs() << "*** Final schedule for " 896 << printMBBReference(*begin()->getParent()) << " ***\n"; 897 dumpSchedule(); 898 dbgs() << '\n'; 899 }); 900 } 901 902 /// Apply each ScheduleDAGMutation step in order. 903 void ScheduleDAGMI::postProcessDAG() { 904 for (auto &m : Mutations) 905 m->apply(this); 906 } 907 908 void ScheduleDAGMI:: 909 findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots, 910 SmallVectorImpl<SUnit*> &BotRoots) { 911 for (SUnit &SU : SUnits) { 912 assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits"); 913 914 // Order predecessors so DFSResult follows the critical path. 915 SU.biasCriticalPath(); 916 917 // A SUnit is ready to top schedule if it has no predecessors. 918 if (!SU.NumPredsLeft) 919 TopRoots.push_back(&SU); 920 // A SUnit is ready to bottom schedule if it has no successors. 921 if (!SU.NumSuccsLeft) 922 BotRoots.push_back(&SU); 923 } 924 ExitSU.biasCriticalPath(); 925 } 926 927 /// Identify DAG roots and setup scheduler queues. 928 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots, 929 ArrayRef<SUnit*> BotRoots) { 930 NextClusterSucc = nullptr; 931 NextClusterPred = nullptr; 932 933 // Release all DAG roots for scheduling, not including EntrySU/ExitSU. 934 // 935 // Nodes with unreleased weak edges can still be roots. 936 // Release top roots in forward order. 937 for (SUnit *SU : TopRoots) 938 SchedImpl->releaseTopNode(SU); 939 940 // Release bottom roots in reverse order so the higher priority nodes appear 941 // first. This is more natural and slightly more efficient. 942 for (SmallVectorImpl<SUnit*>::const_reverse_iterator 943 I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) { 944 SchedImpl->releaseBottomNode(*I); 945 } 946 947 releaseSuccessors(&EntrySU); 948 releasePredecessors(&ExitSU); 949 950 SchedImpl->registerRoots(); 951 952 // Advance past initial DebugValues. 953 CurrentTop = nextIfDebug(RegionBegin, RegionEnd); 954 CurrentBottom = RegionEnd; 955 } 956 957 /// Update scheduler queues after scheduling an instruction. 
958 void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) { 959 // Release dependent instructions for scheduling. 960 if (IsTopNode) 961 releaseSuccessors(SU); 962 else 963 releasePredecessors(SU); 964 965 SU->isScheduled = true; 966 } 967 968 /// Reinsert any remaining debug_values, just like the PostRA scheduler. 969 void ScheduleDAGMI::placeDebugValues() { 970 // If first instruction was a DBG_VALUE then put it back. 971 if (FirstDbgValue) { 972 BB->splice(RegionBegin, BB, FirstDbgValue); 973 RegionBegin = FirstDbgValue; 974 } 975 976 for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator 977 DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { 978 std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI); 979 MachineInstr *DbgValue = P.first; 980 MachineBasicBlock::iterator OrigPrevMI = P.second; 981 if (&*RegionBegin == DbgValue) 982 ++RegionBegin; 983 BB->splice(std::next(OrigPrevMI), BB, DbgValue); 984 if (RegionEnd != BB->end() && OrigPrevMI == &*RegionEnd) 985 RegionEnd = DbgValue; 986 } 987 } 988 989 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 990 static const char *scheduleTableLegend = " i: issue\n x: resource booked"; 991 992 LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceTopDown() const { 993 // Bail off when there is no schedule model to query. 994 if (!SchedModel.hasInstrSchedModel()) 995 return; 996 997 // Nothing to show if there is no or just one instruction. 998 if (BB->size() < 2) 999 return; 1000 1001 dbgs() << " * Schedule table (TopDown):\n"; 1002 dbgs() << scheduleTableLegend << "\n"; 1003 const unsigned FirstCycle = getSUnit(&*(std::begin(*this)))->TopReadyCycle; 1004 unsigned LastCycle = getSUnit(&*(std::prev(std::end(*this))))->TopReadyCycle; 1005 for (MachineInstr &MI : *this) { 1006 SUnit *SU = getSUnit(&MI); 1007 if (!SU) 1008 continue; 1009 const MCSchedClassDesc *SC = getSchedClass(SU); 1010 for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC), 1011 PE = SchedModel.getWriteProcResEnd(SC); 1012 PI != PE; ++PI) { 1013 if (SU->TopReadyCycle + PI->ReleaseAtCycle - 1 > LastCycle) 1014 LastCycle = SU->TopReadyCycle + PI->ReleaseAtCycle - 1; 1015 } 1016 } 1017 // Print the header with the cycles 1018 dbgs() << llvm::left_justify("Cycle", HeaderColWidth); 1019 for (unsigned C = FirstCycle; C <= LastCycle; ++C) 1020 dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth); 1021 dbgs() << "|\n"; 1022 1023 for (MachineInstr &MI : *this) { 1024 SUnit *SU = getSUnit(&MI); 1025 if (!SU) { 1026 dbgs() << "Missing SUnit\n"; 1027 continue; 1028 } 1029 std::string NodeName("SU("); 1030 NodeName += std::to_string(SU->NodeNum) + ")"; 1031 dbgs() << llvm::left_justify(NodeName, HeaderColWidth); 1032 unsigned C = FirstCycle; 1033 for (; C <= LastCycle; ++C) { 1034 if (C == SU->TopReadyCycle) 1035 dbgs() << llvm::left_justify("| i", ColWidth); 1036 else 1037 dbgs() << llvm::left_justify("|", ColWidth); 1038 } 1039 dbgs() << "|\n"; 1040 const MCSchedClassDesc *SC = getSchedClass(SU); 1041 1042 SmallVector<MCWriteProcResEntry, 4> ResourcesIt( 1043 make_range(SchedModel.getWriteProcResBegin(SC), 1044 SchedModel.getWriteProcResEnd(SC))); 1045 1046 if (MISchedSortResourcesInTrace) 1047 llvm::stable_sort(ResourcesIt, 1048 [](const MCWriteProcResEntry &LHS, 1049 const MCWriteProcResEntry &RHS) -> bool { 1050 return LHS.AcquireAtCycle < RHS.AcquireAtCycle || 1051 (LHS.AcquireAtCycle == RHS.AcquireAtCycle && 1052 LHS.ReleaseAtCycle < RHS.ReleaseAtCycle); 1053 }); 1054 for (const MCWriteProcResEntry &PI : 
ResourcesIt) { 1055 C = FirstCycle; 1056 const std::string ResName = 1057 SchedModel.getResourceName(PI.ProcResourceIdx); 1058 dbgs() << llvm::right_justify(ResName + " ", HeaderColWidth); 1059 for (; C < SU->TopReadyCycle + PI.AcquireAtCycle; ++C) { 1060 dbgs() << llvm::left_justify("|", ColWidth); 1061 } 1062 for (unsigned I = 0, E = PI.ReleaseAtCycle - PI.AcquireAtCycle; I != E; 1063 ++I, ++C) 1064 dbgs() << llvm::left_justify("| x", ColWidth); 1065 while (C++ <= LastCycle) 1066 dbgs() << llvm::left_justify("|", ColWidth); 1067 // Place end char 1068 dbgs() << "| \n"; 1069 } 1070 } 1071 } 1072 1073 LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceBottomUp() const { 1074 // Bail off when there is no schedule model to query. 1075 if (!SchedModel.hasInstrSchedModel()) 1076 return; 1077 1078 // Nothing to show if there is no or just one instruction. 1079 if (BB->size() < 2) 1080 return; 1081 1082 dbgs() << " * Schedule table (BottomUp):\n"; 1083 dbgs() << scheduleTableLegend << "\n"; 1084 1085 const int FirstCycle = getSUnit(&*(std::begin(*this)))->BotReadyCycle; 1086 int LastCycle = getSUnit(&*(std::prev(std::end(*this))))->BotReadyCycle; 1087 for (MachineInstr &MI : *this) { 1088 SUnit *SU = getSUnit(&MI); 1089 if (!SU) 1090 continue; 1091 const MCSchedClassDesc *SC = getSchedClass(SU); 1092 for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC), 1093 PE = SchedModel.getWriteProcResEnd(SC); 1094 PI != PE; ++PI) { 1095 if ((int)SU->BotReadyCycle - PI->ReleaseAtCycle + 1 < LastCycle) 1096 LastCycle = (int)SU->BotReadyCycle - PI->ReleaseAtCycle + 1; 1097 } 1098 } 1099 // Print the header with the cycles 1100 dbgs() << llvm::left_justify("Cycle", HeaderColWidth); 1101 for (int C = FirstCycle; C >= LastCycle; --C) 1102 dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth); 1103 dbgs() << "|\n"; 1104 1105 for (MachineInstr &MI : *this) { 1106 SUnit *SU = getSUnit(&MI); 1107 if (!SU) { 1108 dbgs() << "Missing SUnit\n"; 1109 continue; 1110 } 1111 std::string NodeName("SU("); 1112 NodeName += std::to_string(SU->NodeNum) + ")"; 1113 dbgs() << llvm::left_justify(NodeName, HeaderColWidth); 1114 int C = FirstCycle; 1115 for (; C >= LastCycle; --C) { 1116 if (C == (int)SU->BotReadyCycle) 1117 dbgs() << llvm::left_justify("| i", ColWidth); 1118 else 1119 dbgs() << llvm::left_justify("|", ColWidth); 1120 } 1121 dbgs() << "|\n"; 1122 const MCSchedClassDesc *SC = getSchedClass(SU); 1123 SmallVector<MCWriteProcResEntry, 4> ResourcesIt( 1124 make_range(SchedModel.getWriteProcResBegin(SC), 1125 SchedModel.getWriteProcResEnd(SC))); 1126 1127 if (MISchedSortResourcesInTrace) 1128 llvm::stable_sort(ResourcesIt, 1129 [](const MCWriteProcResEntry &LHS, 1130 const MCWriteProcResEntry &RHS) -> bool { 1131 return LHS.AcquireAtCycle < RHS.AcquireAtCycle || 1132 (LHS.AcquireAtCycle == RHS.AcquireAtCycle && 1133 LHS.ReleaseAtCycle < RHS.ReleaseAtCycle); 1134 }); 1135 for (const MCWriteProcResEntry &PI : ResourcesIt) { 1136 C = FirstCycle; 1137 const std::string ResName = 1138 SchedModel.getResourceName(PI.ProcResourceIdx); 1139 dbgs() << llvm::right_justify(ResName + " ", HeaderColWidth); 1140 for (; C > ((int)SU->BotReadyCycle - (int)PI.AcquireAtCycle); --C) { 1141 dbgs() << llvm::left_justify("|", ColWidth); 1142 } 1143 for (unsigned I = 0, E = PI.ReleaseAtCycle - PI.AcquireAtCycle; I != E; 1144 ++I, --C) 1145 dbgs() << llvm::left_justify("| x", ColWidth); 1146 while (C-- >= LastCycle) 1147 dbgs() << llvm::left_justify("|", ColWidth); 1148 // Place end char 1149 dbgs() << "| \n"; 
1150 } 1151 } 1152 } 1153 #endif 1154 1155 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1156 LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const { 1157 if (MISchedDumpScheduleTrace) { 1158 if (DumpDir == DumpDirection::TopDown) 1159 dumpScheduleTraceTopDown(); 1160 else if (DumpDir == DumpDirection::BottomUp) 1161 dumpScheduleTraceBottomUp(); 1162 else if (DumpDir == DumpDirection::Bidirectional) { 1163 dbgs() << "* Schedule table (Bidirectional): not implemented\n"; 1164 } else { 1165 dbgs() << "* Schedule table: DumpDirection not set.\n"; 1166 } 1167 } 1168 1169 for (MachineInstr &MI : *this) { 1170 if (SUnit *SU = getSUnit(&MI)) 1171 dumpNode(*SU); 1172 else 1173 dbgs() << "Missing SUnit\n"; 1174 } 1175 } 1176 #endif 1177 1178 //===----------------------------------------------------------------------===// 1179 // ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals 1180 // preservation. 1181 //===----------------------------------------------------------------------===// 1182 1183 ScheduleDAGMILive::~ScheduleDAGMILive() { 1184 delete DFSResult; 1185 } 1186 1187 void ScheduleDAGMILive::collectVRegUses(SUnit &SU) { 1188 const MachineInstr &MI = *SU.getInstr(); 1189 for (const MachineOperand &MO : MI.operands()) { 1190 if (!MO.isReg()) 1191 continue; 1192 if (!MO.readsReg()) 1193 continue; 1194 if (TrackLaneMasks && !MO.isUse()) 1195 continue; 1196 1197 Register Reg = MO.getReg(); 1198 if (!Reg.isVirtual()) 1199 continue; 1200 1201 // Ignore re-defs. 1202 if (TrackLaneMasks) { 1203 bool FoundDef = false; 1204 for (const MachineOperand &MO2 : MI.all_defs()) { 1205 if (MO2.getReg() == Reg && !MO2.isDead()) { 1206 FoundDef = true; 1207 break; 1208 } 1209 } 1210 if (FoundDef) 1211 continue; 1212 } 1213 1214 // Record this local VReg use. 1215 VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg); 1216 for (; UI != VRegUses.end(); ++UI) { 1217 if (UI->SU == &SU) 1218 break; 1219 } 1220 if (UI == VRegUses.end()) 1221 VRegUses.insert(VReg2SUnit(Reg, LaneBitmask::getNone(), &SU)); 1222 } 1223 } 1224 1225 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after 1226 /// crossing a scheduling boundary. [begin, end) includes all instructions in 1227 /// the region, including the boundary itself and single-instruction regions 1228 /// that don't get scheduled. 1229 void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb, 1230 MachineBasicBlock::iterator begin, 1231 MachineBasicBlock::iterator end, 1232 unsigned regioninstrs) 1233 { 1234 // ScheduleDAGMI initializes SchedImpl's per-region policy. 1235 ScheduleDAGMI::enterRegion(bb, begin, end, regioninstrs); 1236 1237 // For convenience remember the end of the liveness region. 1238 LiveRegionEnd = (RegionEnd == bb->end()) ? RegionEnd : std::next(RegionEnd); 1239 1240 SUPressureDiffs.clear(); 1241 1242 ShouldTrackPressure = SchedImpl->shouldTrackPressure(); 1243 ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks(); 1244 1245 assert((!ShouldTrackLaneMasks || ShouldTrackPressure) && 1246 "ShouldTrackLaneMasks requires ShouldTrackPressure"); 1247 } 1248 1249 // Setup the register pressure trackers for the top scheduled and bottom 1250 // scheduled regions. 
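// Rough picture of the three trackers involved (see also
// buildDAGWithRegPressure()); this is only a summary of the code below, not a
// specification:
//
//   RegionBegin   --> TopRPTracker advances downward as instructions are
//                     scheduled from the top.
//                     ... unscheduled middle zone ...
//   LiveRegionEnd <-- BotRPTracker recedes upward as instructions are
//                     scheduled from the bottom.
//
// RPTracker itself has already swept the whole region bottom-up while the DAG
// was built and supplies the region's live-ins, live-outs, and max pressure.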
1251 void ScheduleDAGMILive::initRegPressure() { 1252 VRegUses.clear(); 1253 VRegUses.setUniverse(MRI.getNumVirtRegs()); 1254 for (SUnit &SU : SUnits) 1255 collectVRegUses(SU); 1256 1257 TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, 1258 ShouldTrackLaneMasks, false); 1259 BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd, 1260 ShouldTrackLaneMasks, false); 1261 1262 // Close the RPTracker to finalize live ins. 1263 RPTracker.closeRegion(); 1264 1265 LLVM_DEBUG(RPTracker.dump()); 1266 1267 // Initialize the live ins and live outs. 1268 TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); 1269 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); 1270 1271 // Close one end of the tracker so we can call 1272 // getMaxUpward/DownwardPressureDelta before advancing across any 1273 // instructions. This converts currently live regs into live ins/outs. 1274 TopRPTracker.closeTop(); 1275 BotRPTracker.closeBottom(); 1276 1277 BotRPTracker.initLiveThru(RPTracker); 1278 if (!BotRPTracker.getLiveThru().empty()) { 1279 TopRPTracker.initLiveThru(BotRPTracker.getLiveThru()); 1280 LLVM_DEBUG(dbgs() << "Live Thru: "; 1281 dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI)); 1282 }; 1283 1284 // For each live out vreg reduce the pressure change associated with other 1285 // uses of the same vreg below the live-out reaching def. 1286 updatePressureDiffs(RPTracker.getPressure().LiveOutRegs); 1287 1288 // Account for liveness generated by the region boundary. 1289 if (LiveRegionEnd != RegionEnd) { 1290 SmallVector<RegisterMaskPair, 8> LiveUses; 1291 BotRPTracker.recede(&LiveUses); 1292 updatePressureDiffs(LiveUses); 1293 } 1294 1295 LLVM_DEBUG(dbgs() << "Top Pressure:\n"; 1296 dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI); 1297 dbgs() << "Bottom Pressure:\n"; 1298 dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI);); 1299 1300 assert((BotRPTracker.getPos() == RegionEnd || 1301 (RegionEnd->isDebugInstr() && 1302 BotRPTracker.getPos() == priorNonDebug(RegionEnd, RegionBegin))) && 1303 "Can't find the region bottom"); 1304 1305 // Cache the list of excess pressure sets in this region. This will also track 1306 // the max pressure in the scheduled code for these sets. 
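// For example (the numbers are purely illustrative): if a pressure set has a
// limit of 16 registers and this region's max pressure for it is 19, the set
// is recorded below as a RegionCriticalPSet, and updateScheduledPressure()
// later tracks the highest pressure the emitted schedule actually reaches for
// that set.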
1307 RegionCriticalPSets.clear(); 1308 const std::vector<unsigned> &RegionPressure = 1309 RPTracker.getPressure().MaxSetPressure; 1310 for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) { 1311 unsigned Limit = RegClassInfo->getRegPressureSetLimit(i); 1312 if (RegionPressure[i] > Limit) { 1313 LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(i) << " Limit " << Limit 1314 << " Actual " << RegionPressure[i] << "\n"); 1315 RegionCriticalPSets.push_back(PressureChange(i)); 1316 } 1317 } 1318 LLVM_DEBUG(dbgs() << "Excess PSets: "; 1319 for (const PressureChange &RCPS 1320 : RegionCriticalPSets) dbgs() 1321 << TRI->getRegPressureSetName(RCPS.getPSet()) << " "; 1322 dbgs() << "\n"); 1323 } 1324 1325 void ScheduleDAGMILive:: 1326 updateScheduledPressure(const SUnit *SU, 1327 const std::vector<unsigned> &NewMaxPressure) { 1328 const PressureDiff &PDiff = getPressureDiff(SU); 1329 unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size(); 1330 for (const PressureChange &PC : PDiff) { 1331 if (!PC.isValid()) 1332 break; 1333 unsigned ID = PC.getPSet(); 1334 while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID) 1335 ++CritIdx; 1336 if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) { 1337 if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc() 1338 && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max()) 1339 RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]); 1340 } 1341 unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID); 1342 if (NewMaxPressure[ID] >= Limit - 2) { 1343 LLVM_DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": " 1344 << NewMaxPressure[ID] 1345 << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ") 1346 << Limit << "(+ " << BotRPTracker.getLiveThru()[ID] 1347 << " livethru)\n"); 1348 } 1349 } 1350 } 1351 1352 /// Update the PressureDiff array for liveness after scheduling this 1353 /// instruction. 1354 void ScheduleDAGMILive::updatePressureDiffs( 1355 ArrayRef<RegisterMaskPair> LiveUses) { 1356 for (const RegisterMaskPair &P : LiveUses) { 1357 Register Reg = P.RegUnit; 1358 /// FIXME: Currently assuming single-use physregs. 1359 if (!Reg.isVirtual()) 1360 continue; 1361 1362 if (ShouldTrackLaneMasks) { 1363 // If the register has just become live then other uses won't change 1364 // this fact anymore => decrement pressure. 1365 // If the register has just become dead then other uses make it come 1366 // back to life => increment pressure. 1367 bool Decrement = P.LaneMask.any(); 1368 1369 for (const VReg2SUnit &V2SU 1370 : make_range(VRegUses.find(Reg), VRegUses.end())) { 1371 SUnit &SU = *V2SU.SU; 1372 if (SU.isScheduled || &SU == &ExitSU) 1373 continue; 1374 1375 PressureDiff &PDiff = getPressureDiff(&SU); 1376 PDiff.addPressureChange(Reg, Decrement, &MRI); 1377 LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU.NodeNum << ") " 1378 << printReg(Reg, TRI) << ':' 1379 << PrintLaneMask(P.LaneMask) << ' ' << *SU.getInstr(); 1380 dbgs() << " to "; PDiff.dump(*TRI);); 1381 } 1382 } else { 1383 assert(P.LaneMask.any()); 1384 LLVM_DEBUG(dbgs() << " LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n"); 1385 // This may be called before CurrentBottom has been initialized. However, 1386 // BotRPTracker must have a valid position. We want the value live into the 1387 // instruction or live out of the block, so ask for the previous 1388 // instruction's live-out. 
1389 const LiveInterval &LI = LIS->getInterval(Reg); 1390 VNInfo *VNI; 1391 MachineBasicBlock::const_iterator I = 1392 nextIfDebug(BotRPTracker.getPos(), BB->end()); 1393 if (I == BB->end()) 1394 VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB)); 1395 else { 1396 LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I)); 1397 VNI = LRQ.valueIn(); 1398 } 1399 // RegisterPressureTracker guarantees that readsReg is true for LiveUses. 1400 assert(VNI && "No live value at use."); 1401 for (const VReg2SUnit &V2SU 1402 : make_range(VRegUses.find(Reg), VRegUses.end())) { 1403 SUnit *SU = V2SU.SU; 1404 // If this use comes before the reaching def, it cannot be a last use, 1405 // so decrease its pressure change. 1406 if (!SU->isScheduled && SU != &ExitSU) { 1407 LiveQueryResult LRQ = 1408 LI.Query(LIS->getInstructionIndex(*SU->getInstr())); 1409 if (LRQ.valueIn() == VNI) { 1410 PressureDiff &PDiff = getPressureDiff(SU); 1411 PDiff.addPressureChange(Reg, true, &MRI); 1412 LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") " 1413 << *SU->getInstr(); 1414 dbgs() << " to "; PDiff.dump(*TRI);); 1415 } 1416 } 1417 } 1418 } 1419 } 1420 } 1421 1422 void ScheduleDAGMILive::dump() const { 1423 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1424 if (EntrySU.getInstr() != nullptr) 1425 dumpNodeAll(EntrySU); 1426 for (const SUnit &SU : SUnits) { 1427 dumpNodeAll(SU); 1428 if (ShouldTrackPressure) { 1429 dbgs() << " Pressure Diff : "; 1430 getPressureDiff(&SU).dump(*TRI); 1431 } 1432 dbgs() << " Single Issue : "; 1433 if (SchedModel.mustBeginGroup(SU.getInstr()) && 1434 SchedModel.mustEndGroup(SU.getInstr())) 1435 dbgs() << "true;"; 1436 else 1437 dbgs() << "false;"; 1438 dbgs() << '\n'; 1439 } 1440 if (ExitSU.getInstr() != nullptr) 1441 dumpNodeAll(ExitSU); 1442 #endif 1443 } 1444 1445 /// schedule - Called back from MachineScheduler::runOnMachineFunction 1446 /// after setting up the current scheduling region. [RegionBegin, RegionEnd) 1447 /// only includes instructions that have DAG nodes, not scheduling boundaries. 1448 /// 1449 /// This is a skeletal driver, with all the functionality pushed into helpers, 1450 /// so that it can be easily extended by experimental schedulers. Generally, 1451 /// implementing MachineSchedStrategy should be sufficient to implement a new 1452 /// scheduling algorithm. However, if a scheduler further subclasses 1453 /// ScheduleDAGMILive then it will want to override this virtual method in order 1454 /// to update any specialized state. 1455 void ScheduleDAGMILive::schedule() { 1456 LLVM_DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n"); 1457 LLVM_DEBUG(SchedImpl->dumpPolicy()); 1458 buildDAGWithRegPressure(); 1459 1460 postProcessDAG(); 1461 1462 SmallVector<SUnit*, 8> TopRoots, BotRoots; 1463 findRootsAndBiasEdges(TopRoots, BotRoots); 1464 1465 // Initialize the strategy before modifying the DAG. 1466 // This may initialize a DFSResult to be used for queue priority. 1467 SchedImpl->initialize(this); 1468 1469 LLVM_DEBUG(dump()); 1470 if (PrintDAGs) dump(); 1471 if (ViewMISchedDAGs) viewGraph(); 1472 1473 // Initialize ready queues now that the DAG and priority data are finalized. 
1474 initQueues(TopRoots, BotRoots); 1475 1476 bool IsTopNode = false; 1477 while (true) { 1478 LLVM_DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n"); 1479 SUnit *SU = SchedImpl->pickNode(IsTopNode); 1480 if (!SU) break; 1481 1482 assert(!SU->isScheduled && "Node already scheduled"); 1483 if (!checkSchedLimit()) 1484 break; 1485 1486 scheduleMI(SU, IsTopNode); 1487 1488 if (DFSResult) { 1489 unsigned SubtreeID = DFSResult->getSubtreeID(SU); 1490 if (!ScheduledTrees.test(SubtreeID)) { 1491 ScheduledTrees.set(SubtreeID); 1492 DFSResult->scheduleTree(SubtreeID); 1493 SchedImpl->scheduleTree(SubtreeID); 1494 } 1495 } 1496 1497 // Notify the scheduling strategy after updating the DAG. 1498 SchedImpl->schedNode(SU, IsTopNode); 1499 1500 updateQueues(SU, IsTopNode); 1501 } 1502 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); 1503 1504 placeDebugValues(); 1505 1506 LLVM_DEBUG({ 1507 dbgs() << "*** Final schedule for " 1508 << printMBBReference(*begin()->getParent()) << " ***\n"; 1509 dumpSchedule(); 1510 dbgs() << '\n'; 1511 }); 1512 } 1513 1514 /// Build the DAG and setup three register pressure trackers. 1515 void ScheduleDAGMILive::buildDAGWithRegPressure() { 1516 if (!ShouldTrackPressure) { 1517 RPTracker.reset(); 1518 RegionCriticalPSets.clear(); 1519 buildSchedGraph(AA); 1520 return; 1521 } 1522 1523 // Initialize the register pressure tracker used by buildSchedGraph. 1524 RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd, 1525 ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true); 1526 1527 // Account for liveness generated by the region boundary. 1528 if (LiveRegionEnd != RegionEnd) 1529 RPTracker.recede(); 1530 1531 // Build the DAG, and compute current register pressure. 1532 buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks); 1533 1534 // Initialize top/bottom trackers after computing region pressure. 1535 initRegPressure(); 1536 } 1537 1538 void ScheduleDAGMILive::computeDFSResult() { 1539 if (!DFSResult) 1540 DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize); 1541 DFSResult->clear(); 1542 ScheduledTrees.clear(); 1543 DFSResult->resize(SUnits.size()); 1544 DFSResult->compute(SUnits); 1545 ScheduledTrees.resize(DFSResult->getNumSubtrees()); 1546 } 1547 1548 /// Compute the max cyclic critical path through the DAG. The scheduling DAG 1549 /// only provides the critical path for single-block loops. To handle loops that 1550 /// span blocks, we could use the vreg path latencies provided by 1551 /// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently 1552 /// available for use in the scheduler. 1553 /// 1554 /// The cyclic path estimation identifies a def-use pair that crosses the back 1555 /// edge and considers the depth and height of the nodes. 
For example, consider
1556 /// the following instruction sequence where each instruction has unit latency
1557 /// and defines an eponymous virtual register:
1558 ///
1559 /// a->b(a,c)->c(b)->d(c)->exit
1560 ///
1561 /// The cyclic critical path is two cycles: b->c->b
1562 /// The acyclic critical path is four cycles: a->b->c->d->exit
1563 /// LiveOutHeight = height(c) = len(c->d->exit) = 2
1564 /// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
1565 /// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
1566 /// LiveInDepth = depth(b) = len(a->b) = 1
1567 ///
1568 /// LiveOutDepth - LiveInDepth = 3 - 1 = 2
1569 /// LiveInHeight - LiveOutHeight = 4 - 2 = 2
1570 /// CyclicCriticalPath = min(2, 2) = 2
1571 ///
1572 /// This could be relevant to PostRA scheduling, but is currently implemented
1573 /// assuming LiveIntervals.
1574 unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
1575   // This only applies to single block loops.
1576   if (!BB->isSuccessor(BB))
1577     return 0;
1578
1579   unsigned MaxCyclicLatency = 0;
1580   // Visit each live out vreg def to find def/use pairs that cross iterations.
1581   for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
1582     Register Reg = P.RegUnit;
1583     if (!Reg.isVirtual())
1584       continue;
1585     const LiveInterval &LI = LIS->getInterval(Reg);
1586     const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
1587     if (!DefVNI)
1588       continue;
1589
1590     MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
1591     const SUnit *DefSU = getSUnit(DefMI);
1592     if (!DefSU)
1593       continue;
1594
1595     unsigned LiveOutHeight = DefSU->getHeight();
1596     unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
1597     // Visit all local users of the vreg def.
1598     for (const VReg2SUnit &V2SU
1599          : make_range(VRegUses.find(Reg), VRegUses.end())) {
1600       SUnit *SU = V2SU.SU;
1601       if (SU == &ExitSU)
1602         continue;
1603
1604       // Only consider uses of the phi.
1605       LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
1606       if (!LRQ.valueIn()->isPHIDef())
1607         continue;
1608
1609       // Assume that a path spanning two iterations is a cycle, which could
1610       // overestimate in strange cases. This allows cyclic latency to be
1611       // estimated as the minimum slack of the vreg's depth or height.
1612       unsigned CyclicLatency = 0;
1613       if (LiveOutDepth > SU->getDepth())
1614         CyclicLatency = LiveOutDepth - SU->getDepth();
1615
1616       unsigned LiveInHeight = SU->getHeight() + DefSU->Latency;
1617       if (LiveInHeight > LiveOutHeight) {
1618         if (LiveInHeight - LiveOutHeight < CyclicLatency)
1619           CyclicLatency = LiveInHeight - LiveOutHeight;
1620       } else
1621         CyclicLatency = 0;
1622
1623       LLVM_DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
1624                         << SU->NodeNum << ") = " << CyclicLatency << "c\n");
1625       if (CyclicLatency > MaxCyclicLatency)
1626         MaxCyclicLatency = CyclicLatency;
1627     }
1628   }
1629   LLVM_DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
1630   return MaxCyclicLatency;
1631 }
1632
1633 /// Release ExitSU predecessors and set up scheduler queues. Re-position
1634 /// the Top RP tracker in case the region beginning has changed.
1635 void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots, 1636 ArrayRef<SUnit*> BotRoots) { 1637 ScheduleDAGMI::initQueues(TopRoots, BotRoots); 1638 if (ShouldTrackPressure) { 1639 assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); 1640 TopRPTracker.setPos(CurrentTop); 1641 } 1642 } 1643 1644 /// Move an instruction and update register pressure. 1645 void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { 1646 // Move the instruction to its new location in the instruction stream. 1647 MachineInstr *MI = SU->getInstr(); 1648 1649 if (IsTopNode) { 1650 assert(SU->isTopReady() && "node still has unscheduled dependencies"); 1651 if (&*CurrentTop == MI) 1652 CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom); 1653 else { 1654 moveInstruction(MI, CurrentTop); 1655 TopRPTracker.setPos(MI); 1656 } 1657 1658 if (ShouldTrackPressure) { 1659 // Update top scheduled pressure. 1660 RegisterOperands RegOpers; 1661 RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); 1662 if (ShouldTrackLaneMasks) { 1663 // Adjust liveness and add missing dead+read-undef flags. 1664 SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); 1665 RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); 1666 } else { 1667 // Adjust for missing dead-def flags. 1668 RegOpers.detectDeadDefs(*MI, *LIS); 1669 } 1670 1671 TopRPTracker.advance(RegOpers); 1672 assert(TopRPTracker.getPos() == CurrentTop && "out of sync"); 1673 LLVM_DEBUG(dbgs() << "Top Pressure:\n"; dumpRegSetPressure( 1674 TopRPTracker.getRegSetPressureAtPos(), TRI);); 1675 1676 updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure); 1677 } 1678 } else { 1679 assert(SU->isBottomReady() && "node still has unscheduled dependencies"); 1680 MachineBasicBlock::iterator priorII = 1681 priorNonDebug(CurrentBottom, CurrentTop); 1682 if (&*priorII == MI) 1683 CurrentBottom = priorII; 1684 else { 1685 if (&*CurrentTop == MI) { 1686 CurrentTop = nextIfDebug(++CurrentTop, priorII); 1687 TopRPTracker.setPos(CurrentTop); 1688 } 1689 moveInstruction(MI, CurrentBottom); 1690 CurrentBottom = MI; 1691 BotRPTracker.setPos(CurrentBottom); 1692 } 1693 if (ShouldTrackPressure) { 1694 RegisterOperands RegOpers; 1695 RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); 1696 if (ShouldTrackLaneMasks) { 1697 // Adjust liveness and add missing dead+read-undef flags. 1698 SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); 1699 RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); 1700 } else { 1701 // Adjust for missing dead-def flags. 1702 RegOpers.detectDeadDefs(*MI, *LIS); 1703 } 1704 1705 if (BotRPTracker.getPos() != CurrentBottom) 1706 BotRPTracker.recedeSkipDebugValues(); 1707 SmallVector<RegisterMaskPair, 8> LiveUses; 1708 BotRPTracker.recede(RegOpers, &LiveUses); 1709 assert(BotRPTracker.getPos() == CurrentBottom && "out of sync"); 1710 LLVM_DEBUG(dbgs() << "Bottom Pressure:\n"; dumpRegSetPressure( 1711 BotRPTracker.getRegSetPressureAtPos(), TRI);); 1712 1713 updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure); 1714 updatePressureDiffs(LiveUses); 1715 } 1716 } 1717 } 1718 1719 //===----------------------------------------------------------------------===// 1720 // BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores. 1721 //===----------------------------------------------------------------------===// 1722 1723 namespace { 1724 1725 /// Post-process the DAG to create cluster edges between neighboring 1726 /// loads or between neighboring stores. 
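///
/// Illustrative sketch only (hypothetical operands, not from any particular
/// target): given two loads such as
///
///   %1 = LOAD %base, 0
///   %2 = LOAD %base, 8
///
/// that the target's shouldClusterMemOps() hook accepts, the mutation adds a
/// weak SDep::Cluster edge between their SUnits, so the scheduler tends to
/// keep them adjacent, e.g. to enable load/store pairing.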
1727 class BaseMemOpClusterMutation : public ScheduleDAGMutation { 1728 struct MemOpInfo { 1729 SUnit *SU; 1730 SmallVector<const MachineOperand *, 4> BaseOps; 1731 int64_t Offset; 1732 LocationSize Width; 1733 bool OffsetIsScalable; 1734 1735 MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps, 1736 int64_t Offset, bool OffsetIsScalable, LocationSize Width) 1737 : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset), 1738 Width(Width), OffsetIsScalable(OffsetIsScalable) {} 1739 1740 static bool Compare(const MachineOperand *const &A, 1741 const MachineOperand *const &B) { 1742 if (A->getType() != B->getType()) 1743 return A->getType() < B->getType(); 1744 if (A->isReg()) 1745 return A->getReg() < B->getReg(); 1746 if (A->isFI()) { 1747 const MachineFunction &MF = *A->getParent()->getParent()->getParent(); 1748 const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); 1749 bool StackGrowsDown = TFI.getStackGrowthDirection() == 1750 TargetFrameLowering::StackGrowsDown; 1751 return StackGrowsDown ? A->getIndex() > B->getIndex() 1752 : A->getIndex() < B->getIndex(); 1753 } 1754 1755 llvm_unreachable("MemOpClusterMutation only supports register or frame " 1756 "index bases."); 1757 } 1758 1759 bool operator<(const MemOpInfo &RHS) const { 1760 // FIXME: Don't compare everything twice. Maybe use C++20 three way 1761 // comparison instead when it's available. 1762 if (std::lexicographical_compare(BaseOps.begin(), BaseOps.end(), 1763 RHS.BaseOps.begin(), RHS.BaseOps.end(), 1764 Compare)) 1765 return true; 1766 if (std::lexicographical_compare(RHS.BaseOps.begin(), RHS.BaseOps.end(), 1767 BaseOps.begin(), BaseOps.end(), Compare)) 1768 return false; 1769 if (Offset != RHS.Offset) 1770 return Offset < RHS.Offset; 1771 return SU->NodeNum < RHS.SU->NodeNum; 1772 } 1773 }; 1774 1775 const TargetInstrInfo *TII; 1776 const TargetRegisterInfo *TRI; 1777 bool IsLoad; 1778 bool ReorderWhileClustering; 1779 1780 public: 1781 BaseMemOpClusterMutation(const TargetInstrInfo *tii, 1782 const TargetRegisterInfo *tri, bool IsLoad, 1783 bool ReorderWhileClustering) 1784 : TII(tii), TRI(tri), IsLoad(IsLoad), 1785 ReorderWhileClustering(ReorderWhileClustering) {} 1786 1787 void apply(ScheduleDAGInstrs *DAGInstrs) override; 1788 1789 protected: 1790 void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster, 1791 ScheduleDAGInstrs *DAG); 1792 void collectMemOpRecords(std::vector<SUnit> &SUnits, 1793 SmallVectorImpl<MemOpInfo> &MemOpRecords); 1794 bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG, 1795 DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups); 1796 }; 1797 1798 class StoreClusterMutation : public BaseMemOpClusterMutation { 1799 public: 1800 StoreClusterMutation(const TargetInstrInfo *tii, 1801 const TargetRegisterInfo *tri, 1802 bool ReorderWhileClustering) 1803 : BaseMemOpClusterMutation(tii, tri, false, ReorderWhileClustering) {} 1804 }; 1805 1806 class LoadClusterMutation : public BaseMemOpClusterMutation { 1807 public: 1808 LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri, 1809 bool ReorderWhileClustering) 1810 : BaseMemOpClusterMutation(tii, tri, true, ReorderWhileClustering) {} 1811 }; 1812 1813 } // end anonymous namespace 1814 1815 namespace llvm { 1816 1817 std::unique_ptr<ScheduleDAGMutation> 1818 createLoadClusterDAGMutation(const TargetInstrInfo *TII, 1819 const TargetRegisterInfo *TRI, 1820 bool ReorderWhileClustering) { 1821 return EnableMemOpCluster ? 
std::make_unique<LoadClusterMutation>(
1822                                   TII, TRI, ReorderWhileClustering)
1823                             : nullptr;
1824 }
1825
1826 std::unique_ptr<ScheduleDAGMutation>
1827 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
1828                               const TargetRegisterInfo *TRI,
1829                               bool ReorderWhileClustering) {
1830   return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
1831                                   TII, TRI, ReorderWhileClustering)
1832                             : nullptr;
1833 }
1834
1835 } // end namespace llvm
1836
1837 // Sort all the loads/stores first. Then, for each load/store, check the
1838 // following loads/stores one by one until reaching the first non-dependent
1839 // one, and call the target hook to see if they can be clustered.
1840 // If FastCluster is enabled, we assume that all the loads/stores have been
1841 // preprocessed and do not have dependencies on each other.
1842 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
1843     ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
1844     ScheduleDAGInstrs *DAG) {
1845   // Keep track of the current cluster length and bytes for each SUnit.
1846   DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
1847
1848   // At this point, the `MemOpRecords` array must hold at least two mem ops.
1849   // Try to cluster the mem ops collected within the `MemOpRecords` array.
1850   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
1851     // The decision to cluster mem ops is taken based on target-dependent logic.
1852     auto MemOpa = MemOpRecords[Idx];
1853
1854     // Look for the next load/store with which to form a cluster.
1855     unsigned NextIdx = Idx + 1;
1856     for (; NextIdx < End; ++NextIdx)
1857       // Skip if MemOpb has been clustered already or has a dependency on
1858       // MemOpa.
1859       if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
1860           (FastCluster ||
1861            (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
1862             !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
1863         break;
1864     if (NextIdx == End)
1865       continue;
1866
1867     auto MemOpb = MemOpRecords[NextIdx];
1868     unsigned ClusterLength = 2;
1869     unsigned CurrentClusterBytes = MemOpa.Width.getValue().getKnownMinValue() +
1870                                    MemOpb.Width.getValue().getKnownMinValue();
1871     if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
1872       ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
1873       CurrentClusterBytes = SUnit2ClusterInfo[MemOpa.SU->NodeNum].second +
1874                             MemOpb.Width.getValue().getKnownMinValue();
1875     }
1876
1877     if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpa.Offset,
1878                                   MemOpa.OffsetIsScalable, MemOpb.BaseOps,
1879                                   MemOpb.Offset, MemOpb.OffsetIsScalable,
1880                                   ClusterLength, CurrentClusterBytes))
1881       continue;
1882
1883     SUnit *SUa = MemOpa.SU;
1884     SUnit *SUb = MemOpb.SU;
1885     if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
1886       std::swap(SUa, SUb);
1887
1888     // FIXME: Is this check really required?
1889     if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
1890       continue;
1891
1892     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
1893                       << SUb->NodeNum << ")\n");
1894     ++NumClustered;
1895
1896     if (IsLoad) {
1897       // Copy successor edges from SUa to SUb. Interleaving computation
1898       // dependent on SUa can prevent load combining due to register reuse.
1899       // Predecessor edges do not need to be copied from SUb to SUa since
1900       // nearby loads should have effectively the same inputs.
1901       for (const SDep &Succ : SUa->Succs) {
1902         if (Succ.getSUnit() == SUb)
1903           continue;
1904         LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
1905                           << ")\n");
1906         DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
1907       }
1908     } else {
1909       // Copy predecessor edges from SUb to SUa to avoid having the SUnits
1910       // that SUb depends on scheduled in between SUb and SUa. Successor
1911       // edges do not need to be copied from SUa to SUb since nothing will
1912       // depend on stores.
1913       // Note that we don't need to worry about memory dependencies: we won't
1914       // try to cluster two mem ops if they have any memory dependency.
1915       for (const SDep &Pred : SUb->Preds) {
1916         if (Pred.getSUnit() == SUa)
1917           continue;
1918         LLVM_DEBUG(dbgs() << " Copy Pred SU(" << Pred.getSUnit()->NodeNum
1919                           << ")\n");
1920         DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
1921       }
1922     }
1923
1924     SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
1925                                              CurrentClusterBytes};
1926
1927     LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
1928                       << ", Curr cluster bytes: " << CurrentClusterBytes
1929                       << "\n");
1930   }
1931 }
1932
1933 void BaseMemOpClusterMutation::collectMemOpRecords(
1934     std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
1935   for (auto &SU : SUnits) {
1936     if ((IsLoad && !SU.getInstr()->mayLoad()) ||
1937         (!IsLoad && !SU.getInstr()->mayStore()))
1938       continue;
1939
1940     const MachineInstr &MI = *SU.getInstr();
1941     SmallVector<const MachineOperand *, 4> BaseOps;
1942     int64_t Offset;
1943     bool OffsetIsScalable;
1944     LocationSize Width = 0;
1945     if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
1946                                            OffsetIsScalable, Width, TRI)) {
1947       MemOpRecords.push_back(
1948           MemOpInfo(&SU, BaseOps, Offset, OffsetIsScalable, Width));
1949
1950       LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
1951                         << Offset << ", OffsetIsScalable: " << OffsetIsScalable
1952                         << ", Width: " << Width << "\n");
1953     }
1954 #ifndef NDEBUG
1955     for (const auto *Op : BaseOps)
1956       assert(Op);
1957 #endif
1958   }
1959 }
1960
1961 bool BaseMemOpClusterMutation::groupMemOps(
1962     ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
1963     DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
1964   bool FastCluster =
1965       ForceFastCluster ||
1966       MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
1967
1968   for (const auto &MemOp : MemOps) {
1969     unsigned ChainPredID = DAG->SUnits.size();
1970     if (FastCluster) {
1971       for (const SDep &Pred : MemOp.SU->Preds) {
1972         // We only want to cluster mem ops that have the same ctrl (non-data)
1973         // pred so that they don't have a ctrl dependency on each other. But
1974         // for store instrs, we can still cluster them if the pred is a load instr.
1975         if ((Pred.isCtrl() &&
1976              (IsLoad ||
1977               (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
1978             !Pred.isArtificial()) {
1979           ChainPredID = Pred.getSUnit()->NodeNum;
1980           break;
1981         }
1982       }
1983     } else
1984       ChainPredID = 0;
1985
1986     Groups[ChainPredID].push_back(MemOp);
1987   }
1988   return FastCluster;
1989 }
1990
1991 /// Callback from DAG postProcessing to create cluster edges for loads/stores.
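///
/// In outline (see the helpers above): collectMemOpRecords() gathers the
/// candidate mem ops, groupMemOps() partitions them (falling back to the
/// FastCluster heuristic when the DAG is large), each group is sorted, and
/// clusterNeighboringMemOps() then adds the actual cluster edges.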
1992 void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
1993   // Collect all the clusterable loads/stores.
1994   SmallVector<MemOpInfo, 32> MemOpRecords;
1995   collectMemOpRecords(DAG->SUnits, MemOpRecords);
1996
1997   if (MemOpRecords.size() < 2)
1998     return;
1999
2000   // If the DAG is too complex, use a heuristic to put loads/stores without
2001   // dependencies into the same group; this avoids a compile-time blowup.
2002   // Note that some fusion pairs may be lost this way.
2003   DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
2004   bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
2005
2006   for (auto &Group : Groups) {
2007     // Sort the loads/stores so that clustering can stop as early as
2008     // possible.
2009     llvm::sort(Group.second);
2010
2011     // Try to cluster all the neighboring loads/stores.
2012     clusterNeighboringMemOps(Group.second, FastCluster, DAG);
2013   }
2014 }
2015
2016 //===----------------------------------------------------------------------===//
2017 // CopyConstrain - DAG post-processing to encourage copy elimination.
2018 //===----------------------------------------------------------------------===//
2019
2020 namespace {
2021
2022 /// Post-process the DAG to create weak edges from all uses of a copy to
2023 /// the one use that defines the copy's source vreg, most likely an induction
2024 /// variable increment.
2025 class CopyConstrain : public ScheduleDAGMutation {
2026   // Transient state.
2027   SlotIndex RegionBeginIdx;
2028
2029   // RegionEndIdx is the slot index of the last non-debug instruction in the
2030   // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
2031   SlotIndex RegionEndIdx;
2032
2033 public:
2034   CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
2035
2036   void apply(ScheduleDAGInstrs *DAGInstrs) override;
2037
2038 protected:
2039   void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
2040 };
2041
2042 } // end anonymous namespace
2043
2044 namespace llvm {
2045
2046 std::unique_ptr<ScheduleDAGMutation>
2047 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
2048                                const TargetRegisterInfo *TRI) {
2049   return std::make_unique<CopyConstrain>(TII, TRI);
2050 }
2051
2052 } // end namespace llvm
2053
2054 /// constrainLocalCopy handles two possibilities:
2055 /// 1) Local src:
2056 /// I0:     = dst
2057 /// I1: src = ...
2058 /// I2:     = dst
2059 /// I3: dst = src (copy)
2060 /// (create pred->succ edges I0->I1, I2->I1)
2061 ///
2062 /// 2) Local copy:
2063 /// I0: dst = src (copy)
2064 /// I1:     = dst
2065 /// I2: src = ...
2066 /// I3:     = dst
2067 /// (create pred->succ edges I1->I2, I3->I2)
2068 ///
2069 /// Although the MachineScheduler is currently constrained to single blocks,
2070 /// this algorithm should handle extended blocks. An EBB is a set of
2071 /// contiguously numbered blocks such that the previous block in the EBB is
2072 /// always the single predecessor.
2073 void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
2074   LiveIntervals *LIS = DAG->getLIS();
2075   MachineInstr *Copy = CopySU->getInstr();
2076
2077   // Check for pure vreg copies.
2078   const MachineOperand &SrcOp = Copy->getOperand(1);
2079   Register SrcReg = SrcOp.getReg();
2080   if (!SrcReg.isVirtual() || !SrcOp.readsReg())
2081     return;
2082
2083   const MachineOperand &DstOp = Copy->getOperand(0);
2084   Register DstReg = DstOp.getReg();
2085   if (!DstReg.isVirtual() || DstOp.isDead())
2086     return;
2087
2088   // Check if either the dest or source is local.
If it's live across a back 2089 // edge, it's not local. Note that if both vregs are live across the back 2090 // edge, we cannot successfully contrain the copy without cyclic scheduling. 2091 // If both the copy's source and dest are local live intervals, then we 2092 // should treat the dest as the global for the purpose of adding 2093 // constraints. This adds edges from source's other uses to the copy. 2094 unsigned LocalReg = SrcReg; 2095 unsigned GlobalReg = DstReg; 2096 LiveInterval *LocalLI = &LIS->getInterval(LocalReg); 2097 if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) { 2098 LocalReg = DstReg; 2099 GlobalReg = SrcReg; 2100 LocalLI = &LIS->getInterval(LocalReg); 2101 if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) 2102 return; 2103 } 2104 LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg); 2105 2106 // Find the global segment after the start of the local LI. 2107 LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex()); 2108 // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a 2109 // local live range. We could create edges from other global uses to the local 2110 // start, but the coalescer should have already eliminated these cases, so 2111 // don't bother dealing with it. 2112 if (GlobalSegment == GlobalLI->end()) 2113 return; 2114 2115 // If GlobalSegment is killed at the LocalLI->start, the call to find() 2116 // returned the next global segment. But if GlobalSegment overlaps with 2117 // LocalLI->start, then advance to the next segment. If a hole in GlobalLI 2118 // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole. 2119 if (GlobalSegment->contains(LocalLI->beginIndex())) 2120 ++GlobalSegment; 2121 2122 if (GlobalSegment == GlobalLI->end()) 2123 return; 2124 2125 // Check if GlobalLI contains a hole in the vicinity of LocalLI. 2126 if (GlobalSegment != GlobalLI->begin()) { 2127 // Two address defs have no hole. 2128 if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->end, 2129 GlobalSegment->start)) { 2130 return; 2131 } 2132 // If the prior global segment may be defined by the same two-address 2133 // instruction that also defines LocalLI, then can't make a hole here. 2134 if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->start, 2135 LocalLI->beginIndex())) { 2136 return; 2137 } 2138 // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise 2139 // it would be a disconnected component in the live range. 2140 assert(std::prev(GlobalSegment)->start < LocalLI->beginIndex() && 2141 "Disconnected LRG within the scheduling region."); 2142 } 2143 MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start); 2144 if (!GlobalDef) 2145 return; 2146 2147 SUnit *GlobalSU = DAG->getSUnit(GlobalDef); 2148 if (!GlobalSU) 2149 return; 2150 2151 // GlobalDef is the bottom of the GlobalLI hole. Open the hole by 2152 // constraining the uses of the last local def to precede GlobalDef. 
2153 SmallVector<SUnit*,8> LocalUses; 2154 const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex()); 2155 MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def); 2156 SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef); 2157 for (const SDep &Succ : LastLocalSU->Succs) { 2158 if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg) 2159 continue; 2160 if (Succ.getSUnit() == GlobalSU) 2161 continue; 2162 if (!DAG->canAddEdge(GlobalSU, Succ.getSUnit())) 2163 return; 2164 LocalUses.push_back(Succ.getSUnit()); 2165 } 2166 // Open the top of the GlobalLI hole by constraining any earlier global uses 2167 // to precede the start of LocalLI. 2168 SmallVector<SUnit*,8> GlobalUses; 2169 MachineInstr *FirstLocalDef = 2170 LIS->getInstructionFromIndex(LocalLI->beginIndex()); 2171 SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef); 2172 for (const SDep &Pred : GlobalSU->Preds) { 2173 if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg) 2174 continue; 2175 if (Pred.getSUnit() == FirstLocalSU) 2176 continue; 2177 if (!DAG->canAddEdge(FirstLocalSU, Pred.getSUnit())) 2178 return; 2179 GlobalUses.push_back(Pred.getSUnit()); 2180 } 2181 LLVM_DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n"); 2182 // Add the weak edges. 2183 for (SUnit *LU : LocalUses) { 2184 LLVM_DEBUG(dbgs() << " Local use SU(" << LU->NodeNum << ") -> SU(" 2185 << GlobalSU->NodeNum << ")\n"); 2186 DAG->addEdge(GlobalSU, SDep(LU, SDep::Weak)); 2187 } 2188 for (SUnit *GU : GlobalUses) { 2189 LLVM_DEBUG(dbgs() << " Global use SU(" << GU->NodeNum << ") -> SU(" 2190 << FirstLocalSU->NodeNum << ")\n"); 2191 DAG->addEdge(FirstLocalSU, SDep(GU, SDep::Weak)); 2192 } 2193 } 2194 2195 /// Callback from DAG postProcessing to create weak edges to encourage 2196 /// copy elimination. 2197 void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) { 2198 ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); 2199 assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals"); 2200 2201 MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end()); 2202 if (FirstPos == DAG->end()) 2203 return; 2204 RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos); 2205 RegionEndIdx = DAG->getLIS()->getInstructionIndex( 2206 *priorNonDebug(DAG->end(), DAG->begin())); 2207 2208 for (SUnit &SU : DAG->SUnits) { 2209 if (!SU.getInstr()->isCopy()) 2210 continue; 2211 2212 constrainLocalCopy(&SU, static_cast<ScheduleDAGMILive*>(DAG)); 2213 } 2214 } 2215 2216 //===----------------------------------------------------------------------===// 2217 // MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler 2218 // and possibly other custom schedulers. 2219 //===----------------------------------------------------------------------===// 2220 2221 static const unsigned InvalidCycle = ~0U; 2222 2223 SchedBoundary::~SchedBoundary() { delete HazardRec; } 2224 2225 /// Given a Count of resource usage and a Latency value, return true if a 2226 /// SchedBoundary becomes resource limited. 2227 /// If we are checking after scheduling a node, we should return true when 2228 /// we just reach the resource limit. 
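///
/// Worked example with illustrative numbers only: for LFactor = 2,
/// Latency = 10 and Count = 24, ResCntFactor = 24 - 10*2 = 4 >= 2, so the
/// zone is resource limited in either mode. With Count = 22, ResCntFactor is
/// 2, which reaches the limit only after scheduling a node (>= versus >).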
2229 static bool checkResourceLimit(unsigned LFactor, unsigned Count, 2230 unsigned Latency, bool AfterSchedNode) { 2231 int ResCntFactor = (int)(Count - (Latency * LFactor)); 2232 if (AfterSchedNode) 2233 return ResCntFactor >= (int)LFactor; 2234 else 2235 return ResCntFactor > (int)LFactor; 2236 } 2237 2238 void SchedBoundary::reset() { 2239 // A new HazardRec is created for each DAG and owned by SchedBoundary. 2240 // Destroying and reconstructing it is very expensive though. So keep 2241 // invalid, placeholder HazardRecs. 2242 if (HazardRec && HazardRec->isEnabled()) { 2243 delete HazardRec; 2244 HazardRec = nullptr; 2245 } 2246 Available.clear(); 2247 Pending.clear(); 2248 CheckPending = false; 2249 CurrCycle = 0; 2250 CurrMOps = 0; 2251 MinReadyCycle = std::numeric_limits<unsigned>::max(); 2252 ExpectedLatency = 0; 2253 DependentLatency = 0; 2254 RetiredMOps = 0; 2255 MaxExecutedResCount = 0; 2256 ZoneCritResIdx = 0; 2257 IsResourceLimited = false; 2258 ReservedCycles.clear(); 2259 ReservedResourceSegments.clear(); 2260 ReservedCyclesIndex.clear(); 2261 ResourceGroupSubUnitMasks.clear(); 2262 #if LLVM_ENABLE_ABI_BREAKING_CHECKS 2263 // Track the maximum number of stall cycles that could arise either from the 2264 // latency of a DAG edge or the number of cycles that a processor resource is 2265 // reserved (SchedBoundary::ReservedCycles). 2266 MaxObservedStall = 0; 2267 #endif 2268 // Reserve a zero-count for invalid CritResIdx. 2269 ExecutedResCounts.resize(1); 2270 assert(!ExecutedResCounts[0] && "nonzero count for bad resource"); 2271 } 2272 2273 void SchedRemainder:: 2274 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { 2275 reset(); 2276 if (!SchedModel->hasInstrSchedModel()) 2277 return; 2278 RemainingCounts.resize(SchedModel->getNumProcResourceKinds()); 2279 for (SUnit &SU : DAG->SUnits) { 2280 const MCSchedClassDesc *SC = DAG->getSchedClass(&SU); 2281 RemIssueCount += SchedModel->getNumMicroOps(SU.getInstr(), SC) 2282 * SchedModel->getMicroOpFactor(); 2283 for (TargetSchedModel::ProcResIter 2284 PI = SchedModel->getWriteProcResBegin(SC), 2285 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2286 unsigned PIdx = PI->ProcResourceIdx; 2287 unsigned Factor = SchedModel->getResourceFactor(PIdx); 2288 assert(PI->ReleaseAtCycle >= PI->AcquireAtCycle); 2289 RemainingCounts[PIdx] += 2290 (Factor * (PI->ReleaseAtCycle - PI->AcquireAtCycle)); 2291 } 2292 } 2293 } 2294 2295 void SchedBoundary:: 2296 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { 2297 reset(); 2298 DAG = dag; 2299 SchedModel = smodel; 2300 Rem = rem; 2301 if (SchedModel->hasInstrSchedModel()) { 2302 unsigned ResourceCount = SchedModel->getNumProcResourceKinds(); 2303 ReservedCyclesIndex.resize(ResourceCount); 2304 ExecutedResCounts.resize(ResourceCount); 2305 ResourceGroupSubUnitMasks.resize(ResourceCount, APInt(ResourceCount, 0)); 2306 unsigned NumUnits = 0; 2307 2308 for (unsigned i = 0; i < ResourceCount; ++i) { 2309 ReservedCyclesIndex[i] = NumUnits; 2310 NumUnits += SchedModel->getProcResource(i)->NumUnits; 2311 if (isUnbufferedGroup(i)) { 2312 auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin; 2313 for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits; 2314 U != UE; ++U) 2315 ResourceGroupSubUnitMasks[i].setBit(SubUnits[U]); 2316 } 2317 } 2318 2319 ReservedCycles.resize(NumUnits, InvalidCycle); 2320 } 2321 } 2322 2323 /// Compute the stall cycles based on this SUnit's ready time. 
Heuristics treat 2324 /// these "soft stalls" differently than the hard stall cycles based on CPU 2325 /// resources and computed by checkHazard(). A fully in-order model 2326 /// (MicroOpBufferSize==0) will not make use of this since instructions are not 2327 /// available for scheduling until they are ready. However, a weaker in-order 2328 /// model may use this for heuristics. For example, if a processor has in-order 2329 /// behavior when reading certain resources, this may come into play. 2330 unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) { 2331 if (!SU->isUnbuffered) 2332 return 0; 2333 2334 unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle); 2335 if (ReadyCycle > CurrCycle) 2336 return ReadyCycle - CurrCycle; 2337 return 0; 2338 } 2339 2340 /// Compute the next cycle at which the given processor resource unit 2341 /// can be scheduled. 2342 unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx, 2343 unsigned ReleaseAtCycle, 2344 unsigned AcquireAtCycle) { 2345 if (SchedModel && SchedModel->enableIntervals()) { 2346 if (isTop()) 2347 return ReservedResourceSegments[InstanceIdx].getFirstAvailableAtFromTop( 2348 CurrCycle, AcquireAtCycle, ReleaseAtCycle); 2349 2350 return ReservedResourceSegments[InstanceIdx].getFirstAvailableAtFromBottom( 2351 CurrCycle, AcquireAtCycle, ReleaseAtCycle); 2352 } 2353 2354 unsigned NextUnreserved = ReservedCycles[InstanceIdx]; 2355 // If this resource has never been used, always return cycle zero. 2356 if (NextUnreserved == InvalidCycle) 2357 return CurrCycle; 2358 // For bottom-up scheduling add the cycles needed for the current operation. 2359 if (!isTop()) 2360 NextUnreserved = std::max(CurrCycle, NextUnreserved + ReleaseAtCycle); 2361 return NextUnreserved; 2362 } 2363 2364 /// Compute the next cycle at which the given processor resource can be 2365 /// scheduled. Returns the next cycle and the index of the processor resource 2366 /// instance in the reserved cycles vector. 2367 std::pair<unsigned, unsigned> 2368 SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, 2369 unsigned ReleaseAtCycle, 2370 unsigned AcquireAtCycle) { 2371 if (MischedDetailResourceBooking) { 2372 LLVM_DEBUG(dbgs() << " Resource booking (@" << CurrCycle << "c): \n"); 2373 LLVM_DEBUG(dumpReservedCycles()); 2374 LLVM_DEBUG(dbgs() << " getNextResourceCycle (@" << CurrCycle << "c): \n"); 2375 } 2376 unsigned MinNextUnreserved = InvalidCycle; 2377 unsigned InstanceIdx = 0; 2378 unsigned StartIndex = ReservedCyclesIndex[PIdx]; 2379 unsigned NumberOfInstances = SchedModel->getProcResource(PIdx)->NumUnits; 2380 assert(NumberOfInstances > 0 && 2381 "Cannot have zero instances of a ProcResource"); 2382 2383 if (isUnbufferedGroup(PIdx)) { 2384 // If any subunits are used by the instruction, report that the 2385 // subunits of the resource group are available at the first cycle 2386 // in which the unit is available, effectively removing the group 2387 // record from hazarding and basing the hazarding decisions on the 2388 // subunit records. Otherwise, choose the first available instance 2389 // from among the subunits. Specifications which assign cycles to 2390 // both the subunits and the group or which use an unbuffered 2391 // group with buffered subunits will appear to schedule 2392 // strangely. In the first case, the additional cycles for the 2393 // group will be ignored. In the second, the group will be 2394 // ignored entirely. 
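    // The first loop below handles the case where the instruction writes one
    // of this group's subunits directly; otherwise the second loop picks the
    // subunit instance that becomes available earliest.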
2395 for (const MCWriteProcResEntry &PE : 2396 make_range(SchedModel->getWriteProcResBegin(SC), 2397 SchedModel->getWriteProcResEnd(SC))) 2398 if (ResourceGroupSubUnitMasks[PIdx][PE.ProcResourceIdx]) 2399 return std::make_pair(getNextResourceCycleByInstance( 2400 StartIndex, ReleaseAtCycle, AcquireAtCycle), 2401 StartIndex); 2402 2403 auto SubUnits = SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin; 2404 for (unsigned I = 0, End = NumberOfInstances; I < End; ++I) { 2405 unsigned NextUnreserved, NextInstanceIdx; 2406 std::tie(NextUnreserved, NextInstanceIdx) = 2407 getNextResourceCycle(SC, SubUnits[I], ReleaseAtCycle, AcquireAtCycle); 2408 if (MinNextUnreserved > NextUnreserved) { 2409 InstanceIdx = NextInstanceIdx; 2410 MinNextUnreserved = NextUnreserved; 2411 } 2412 } 2413 return std::make_pair(MinNextUnreserved, InstanceIdx); 2414 } 2415 2416 for (unsigned I = StartIndex, End = StartIndex + NumberOfInstances; I < End; 2417 ++I) { 2418 unsigned NextUnreserved = 2419 getNextResourceCycleByInstance(I, ReleaseAtCycle, AcquireAtCycle); 2420 if (MischedDetailResourceBooking) 2421 LLVM_DEBUG(dbgs() << " Instance " << I - StartIndex << " available @" 2422 << NextUnreserved << "c\n"); 2423 if (MinNextUnreserved > NextUnreserved) { 2424 InstanceIdx = I; 2425 MinNextUnreserved = NextUnreserved; 2426 } 2427 } 2428 if (MischedDetailResourceBooking) 2429 LLVM_DEBUG(dbgs() << " selecting " << SchedModel->getResourceName(PIdx) 2430 << "[" << InstanceIdx - StartIndex << "]" 2431 << " available @" << MinNextUnreserved << "c" 2432 << "\n"); 2433 return std::make_pair(MinNextUnreserved, InstanceIdx); 2434 } 2435 2436 /// Does this SU have a hazard within the current instruction group. 2437 /// 2438 /// The scheduler supports two modes of hazard recognition. The first is the 2439 /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that 2440 /// supports highly complicated in-order reservation tables 2441 /// (ScoreboardHazardRecognizer) and arbitrary target-specific logic. 2442 /// 2443 /// The second is a streamlined mechanism that checks for hazards based on 2444 /// simple counters that the scheduler itself maintains. It explicitly checks 2445 /// for instruction dispatch limitations, including the number of micro-ops that 2446 /// can dispatch per cycle. 2447 /// 2448 /// TODO: Also check whether the SU must start a new group. 2449 bool SchedBoundary::checkHazard(SUnit *SU) { 2450 if (HazardRec->isEnabled() 2451 && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) { 2452 return true; 2453 } 2454 2455 unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); 2456 if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) { 2457 LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" 2458 << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); 2459 return true; 2460 } 2461 2462 if (CurrMOps > 0 && 2463 ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) || 2464 (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) { 2465 LLVM_DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must " 2466 << (isTop() ? 
"begin" : "end") << " group\n"); 2467 return true; 2468 } 2469 2470 if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) { 2471 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 2472 for (const MCWriteProcResEntry &PE : 2473 make_range(SchedModel->getWriteProcResBegin(SC), 2474 SchedModel->getWriteProcResEnd(SC))) { 2475 unsigned ResIdx = PE.ProcResourceIdx; 2476 unsigned ReleaseAtCycle = PE.ReleaseAtCycle; 2477 unsigned AcquireAtCycle = PE.AcquireAtCycle; 2478 unsigned NRCycle, InstanceIdx; 2479 std::tie(NRCycle, InstanceIdx) = 2480 getNextResourceCycle(SC, ResIdx, ReleaseAtCycle, AcquireAtCycle); 2481 if (NRCycle > CurrCycle) { 2482 #if LLVM_ENABLE_ABI_BREAKING_CHECKS 2483 MaxObservedStall = std::max(ReleaseAtCycle, MaxObservedStall); 2484 #endif 2485 LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") " 2486 << SchedModel->getResourceName(ResIdx) 2487 << '[' << InstanceIdx - ReservedCyclesIndex[ResIdx] << ']' 2488 << "=" << NRCycle << "c\n"); 2489 return true; 2490 } 2491 } 2492 } 2493 return false; 2494 } 2495 2496 // Find the unscheduled node in ReadySUs with the highest latency. 2497 unsigned SchedBoundary:: 2498 findMaxLatency(ArrayRef<SUnit*> ReadySUs) { 2499 SUnit *LateSU = nullptr; 2500 unsigned RemLatency = 0; 2501 for (SUnit *SU : ReadySUs) { 2502 unsigned L = getUnscheduledLatency(SU); 2503 if (L > RemLatency) { 2504 RemLatency = L; 2505 LateSU = SU; 2506 } 2507 } 2508 if (LateSU) { 2509 LLVM_DEBUG(dbgs() << Available.getName() << " RemLatency SU(" 2510 << LateSU->NodeNum << ") " << RemLatency << "c\n"); 2511 } 2512 return RemLatency; 2513 } 2514 2515 // Count resources in this zone and the remaining unscheduled 2516 // instruction. Return the max count, scaled. Set OtherCritIdx to the critical 2517 // resource index, or zero if the zone is issue limited. 2518 unsigned SchedBoundary:: 2519 getOtherResourceCount(unsigned &OtherCritIdx) { 2520 OtherCritIdx = 0; 2521 if (!SchedModel->hasInstrSchedModel()) 2522 return 0; 2523 2524 unsigned OtherCritCount = Rem->RemIssueCount 2525 + (RetiredMOps * SchedModel->getMicroOpFactor()); 2526 LLVM_DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: " 2527 << OtherCritCount / SchedModel->getMicroOpFactor() << '\n'); 2528 for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds(); 2529 PIdx != PEnd; ++PIdx) { 2530 unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx]; 2531 if (OtherCount > OtherCritCount) { 2532 OtherCritCount = OtherCount; 2533 OtherCritIdx = PIdx; 2534 } 2535 } 2536 if (OtherCritIdx) { 2537 LLVM_DEBUG( 2538 dbgs() << " " << Available.getName() << " + Remain CritRes: " 2539 << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx) 2540 << " " << SchedModel->getResourceName(OtherCritIdx) << "\n"); 2541 } 2542 return OtherCritCount; 2543 } 2544 2545 void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle, bool InPQueue, 2546 unsigned Idx) { 2547 assert(SU->getInstr() && "Scheduled SUnit must have instr"); 2548 2549 #if LLVM_ENABLE_ABI_BREAKING_CHECKS 2550 // ReadyCycle was been bumped up to the CurrCycle when this node was 2551 // scheduled, but CurrCycle may have been eagerly advanced immediately after 2552 // scheduling, so may now be greater than ReadyCycle. 2553 if (ReadyCycle > CurrCycle) 2554 MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall); 2555 #endif 2556 2557 if (ReadyCycle < MinReadyCycle) 2558 MinReadyCycle = ReadyCycle; 2559 2560 // Check for interlocks first. 
For the purpose of other heuristics, an 2561 // instruction that cannot issue appears as if it's not in the ReadyQueue. 2562 bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; 2563 bool HazardDetected = (!IsBuffered && ReadyCycle > CurrCycle) || 2564 checkHazard(SU) || (Available.size() >= ReadyListLimit); 2565 2566 if (!HazardDetected) { 2567 Available.push(SU); 2568 2569 if (InPQueue) 2570 Pending.remove(Pending.begin() + Idx); 2571 return; 2572 } 2573 2574 if (!InPQueue) 2575 Pending.push(SU); 2576 } 2577 2578 /// Move the boundary of scheduled code by one cycle. 2579 void SchedBoundary::bumpCycle(unsigned NextCycle) { 2580 if (SchedModel->getMicroOpBufferSize() == 0) { 2581 assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && 2582 "MinReadyCycle uninitialized"); 2583 if (MinReadyCycle > NextCycle) 2584 NextCycle = MinReadyCycle; 2585 } 2586 // Update the current micro-ops, which will issue in the next cycle. 2587 unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle); 2588 CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps; 2589 2590 // Decrement DependentLatency based on the next cycle. 2591 if ((NextCycle - CurrCycle) > DependentLatency) 2592 DependentLatency = 0; 2593 else 2594 DependentLatency -= (NextCycle - CurrCycle); 2595 2596 if (!HazardRec->isEnabled()) { 2597 // Bypass HazardRec virtual calls. 2598 CurrCycle = NextCycle; 2599 } else { 2600 // Bypass getHazardType calls in case of long latency. 2601 for (; CurrCycle != NextCycle; ++CurrCycle) { 2602 if (isTop()) 2603 HazardRec->AdvanceCycle(); 2604 else 2605 HazardRec->RecedeCycle(); 2606 } 2607 } 2608 CheckPending = true; 2609 IsResourceLimited = 2610 checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(), 2611 getScheduledLatency(), true); 2612 2613 LLVM_DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() 2614 << '\n'); 2615 } 2616 2617 void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) { 2618 ExecutedResCounts[PIdx] += Count; 2619 if (ExecutedResCounts[PIdx] > MaxExecutedResCount) 2620 MaxExecutedResCount = ExecutedResCounts[PIdx]; 2621 } 2622 2623 /// Add the given processor resource to this scheduled zone. 2624 /// 2625 /// \param ReleaseAtCycle indicates the number of consecutive (non-pipelined) 2626 /// cycles during which this resource is released. 2627 /// 2628 /// \param AcquireAtCycle indicates the number of consecutive (non-pipelined) 2629 /// cycles at which the resource is aquired after issue (assuming no stalls). 2630 /// 2631 /// \return the next cycle at which the instruction may execute without 2632 /// oversubscribing resources. 2633 unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx, 2634 unsigned ReleaseAtCycle, 2635 unsigned NextCycle, 2636 unsigned AcquireAtCycle) { 2637 unsigned Factor = SchedModel->getResourceFactor(PIdx); 2638 unsigned Count = Factor * (ReleaseAtCycle- AcquireAtCycle); 2639 LLVM_DEBUG(dbgs() << " " << SchedModel->getResourceName(PIdx) << " +" 2640 << ReleaseAtCycle << "x" << Factor << "u\n"); 2641 2642 // Update Executed resources counts. 2643 incExecutedResources(PIdx, Count); 2644 assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted"); 2645 Rem->RemainingCounts[PIdx] -= Count; 2646 2647 // Check if this resource exceeds the current critical resource. If so, it 2648 // becomes the critical resource. 
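  // Counts are kept in scaled units so that different resource kinds are
  // directly comparable; dividing by the latency factor (as in the debug
  // output below) converts a scaled count back into cycles.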
2649 if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) { 2650 ZoneCritResIdx = PIdx; 2651 LLVM_DEBUG(dbgs() << " *** Critical resource " 2652 << SchedModel->getResourceName(PIdx) << ": " 2653 << getResourceCount(PIdx) / SchedModel->getLatencyFactor() 2654 << "c\n"); 2655 } 2656 // For reserved resources, record the highest cycle using the resource. 2657 unsigned NextAvailable, InstanceIdx; 2658 std::tie(NextAvailable, InstanceIdx) = 2659 getNextResourceCycle(SC, PIdx, ReleaseAtCycle, AcquireAtCycle); 2660 if (NextAvailable > CurrCycle) { 2661 LLVM_DEBUG(dbgs() << " Resource conflict: " 2662 << SchedModel->getResourceName(PIdx) 2663 << '[' << InstanceIdx - ReservedCyclesIndex[PIdx] << ']' 2664 << " reserved until @" << NextAvailable << "\n"); 2665 } 2666 return NextAvailable; 2667 } 2668 2669 /// Move the boundary of scheduled code by one SUnit. 2670 void SchedBoundary::bumpNode(SUnit *SU) { 2671 // Update the reservation table. 2672 if (HazardRec->isEnabled()) { 2673 if (!isTop() && SU->isCall) { 2674 // Calls are scheduled with their preceding instructions. For bottom-up 2675 // scheduling, clear the pipeline state before emitting. 2676 HazardRec->Reset(); 2677 } 2678 HazardRec->EmitInstruction(SU); 2679 // Scheduling an instruction may have made pending instructions available. 2680 CheckPending = true; 2681 } 2682 // checkHazard should prevent scheduling multiple instructions per cycle that 2683 // exceed the issue width. 2684 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 2685 unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr()); 2686 assert( 2687 (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) && 2688 "Cannot schedule this instruction's MicroOps in the current cycle."); 2689 2690 unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle); 2691 LLVM_DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n"); 2692 2693 unsigned NextCycle = CurrCycle; 2694 switch (SchedModel->getMicroOpBufferSize()) { 2695 case 0: 2696 assert(ReadyCycle <= CurrCycle && "Broken PendingQueue"); 2697 break; 2698 case 1: 2699 if (ReadyCycle > NextCycle) { 2700 NextCycle = ReadyCycle; 2701 LLVM_DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n"); 2702 } 2703 break; 2704 default: 2705 // We don't currently model the OOO reorder buffer, so consider all 2706 // scheduled MOps to be "retired". We do loosely model in-order resource 2707 // latency. If this instruction uses an in-order resource, account for any 2708 // likely stall cycles. 2709 if (SU->isUnbuffered && ReadyCycle > NextCycle) 2710 NextCycle = ReadyCycle; 2711 break; 2712 } 2713 RetiredMOps += IncMOps; 2714 2715 // Update resource counts and critical resource. 2716 if (SchedModel->hasInstrSchedModel()) { 2717 unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor(); 2718 assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted"); 2719 Rem->RemIssueCount -= DecRemIssue; 2720 if (ZoneCritResIdx) { 2721 // Scale scheduled micro-ops for comparing with the critical resource. 2722 unsigned ScaledMOps = 2723 RetiredMOps * SchedModel->getMicroOpFactor(); 2724 2725 // If scaled micro-ops are now more than the previous critical resource by 2726 // a full cycle, then micro-ops issue becomes critical. 
2727 if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx)) 2728 >= (int)SchedModel->getLatencyFactor()) { 2729 ZoneCritResIdx = 0; 2730 LLVM_DEBUG(dbgs() << " *** Critical resource NumMicroOps: " 2731 << ScaledMOps / SchedModel->getLatencyFactor() 2732 << "c\n"); 2733 } 2734 } 2735 for (TargetSchedModel::ProcResIter 2736 PI = SchedModel->getWriteProcResBegin(SC), 2737 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2738 unsigned RCycle = 2739 countResource(SC, PI->ProcResourceIdx, PI->ReleaseAtCycle, NextCycle, 2740 PI->AcquireAtCycle); 2741 if (RCycle > NextCycle) 2742 NextCycle = RCycle; 2743 } 2744 if (SU->hasReservedResource) { 2745 // For reserved resources, record the highest cycle using the resource. 2746 // For top-down scheduling, this is the cycle in which we schedule this 2747 // instruction plus the number of cycles the operations reserves the 2748 // resource. For bottom-up is it simply the instruction's cycle. 2749 for (TargetSchedModel::ProcResIter 2750 PI = SchedModel->getWriteProcResBegin(SC), 2751 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2752 unsigned PIdx = PI->ProcResourceIdx; 2753 if (SchedModel->getProcResource(PIdx)->BufferSize == 0) { 2754 2755 if (SchedModel && SchedModel->enableIntervals()) { 2756 unsigned ReservedUntil, InstanceIdx; 2757 std::tie(ReservedUntil, InstanceIdx) = getNextResourceCycle( 2758 SC, PIdx, PI->ReleaseAtCycle, PI->AcquireAtCycle); 2759 if (isTop()) { 2760 ReservedResourceSegments[InstanceIdx].add( 2761 ResourceSegments::getResourceIntervalTop( 2762 NextCycle, PI->AcquireAtCycle, PI->ReleaseAtCycle), 2763 MIResourceCutOff); 2764 } else { 2765 ReservedResourceSegments[InstanceIdx].add( 2766 ResourceSegments::getResourceIntervalBottom( 2767 NextCycle, PI->AcquireAtCycle, PI->ReleaseAtCycle), 2768 MIResourceCutOff); 2769 } 2770 } else { 2771 2772 unsigned ReservedUntil, InstanceIdx; 2773 std::tie(ReservedUntil, InstanceIdx) = getNextResourceCycle( 2774 SC, PIdx, PI->ReleaseAtCycle, PI->AcquireAtCycle); 2775 if (isTop()) { 2776 ReservedCycles[InstanceIdx] = 2777 std::max(ReservedUntil, NextCycle + PI->ReleaseAtCycle); 2778 } else 2779 ReservedCycles[InstanceIdx] = NextCycle; 2780 } 2781 } 2782 } 2783 } 2784 } 2785 // Update ExpectedLatency and DependentLatency. 2786 unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency; 2787 unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency; 2788 if (SU->getDepth() > TopLatency) { 2789 TopLatency = SU->getDepth(); 2790 LLVM_DEBUG(dbgs() << " " << Available.getName() << " TopLatency SU(" 2791 << SU->NodeNum << ") " << TopLatency << "c\n"); 2792 } 2793 if (SU->getHeight() > BotLatency) { 2794 BotLatency = SU->getHeight(); 2795 LLVM_DEBUG(dbgs() << " " << Available.getName() << " BotLatency SU(" 2796 << SU->NodeNum << ") " << BotLatency << "c\n"); 2797 } 2798 // If we stall for any reason, bump the cycle. 2799 if (NextCycle > CurrCycle) 2800 bumpCycle(NextCycle); 2801 else 2802 // After updating ZoneCritResIdx and ExpectedLatency, check if we're 2803 // resource limited. If a stall occurred, bumpCycle does this. 2804 IsResourceLimited = 2805 checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(), 2806 getScheduledLatency(), true); 2807 2808 // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle 2809 // resets CurrMOps. Loop to handle instructions with more MOps than issue in 2810 // one cycle. 
Since we commonly reach the max MOps here, opportunistically 2811 // bump the cycle to avoid uselessly checking everything in the readyQ. 2812 CurrMOps += IncMOps; 2813 2814 // Bump the cycle count for issue group constraints. 2815 // This must be done after NextCycle has been adjust for all other stalls. 2816 // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set 2817 // currCycle to X. 2818 if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) || 2819 (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) { 2820 LLVM_DEBUG(dbgs() << " Bump cycle to " << (isTop() ? "end" : "begin") 2821 << " group\n"); 2822 bumpCycle(++NextCycle); 2823 } 2824 2825 while (CurrMOps >= SchedModel->getIssueWidth()) { 2826 LLVM_DEBUG(dbgs() << " *** Max MOps " << CurrMOps << " at cycle " 2827 << CurrCycle << '\n'); 2828 bumpCycle(++NextCycle); 2829 } 2830 LLVM_DEBUG(dumpScheduledState()); 2831 } 2832 2833 /// Release pending ready nodes in to the available queue. This makes them 2834 /// visible to heuristics. 2835 void SchedBoundary::releasePending() { 2836 // If the available queue is empty, it is safe to reset MinReadyCycle. 2837 if (Available.empty()) 2838 MinReadyCycle = std::numeric_limits<unsigned>::max(); 2839 2840 // Check to see if any of the pending instructions are ready to issue. If 2841 // so, add them to the available queue. 2842 for (unsigned I = 0, E = Pending.size(); I < E; ++I) { 2843 SUnit *SU = *(Pending.begin() + I); 2844 unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; 2845 2846 if (ReadyCycle < MinReadyCycle) 2847 MinReadyCycle = ReadyCycle; 2848 2849 if (Available.size() >= ReadyListLimit) 2850 break; 2851 2852 releaseNode(SU, ReadyCycle, true, I); 2853 if (E != Pending.size()) { 2854 --I; 2855 --E; 2856 } 2857 } 2858 CheckPending = false; 2859 } 2860 2861 /// Remove SU from the ready set for this boundary. 2862 void SchedBoundary::removeReady(SUnit *SU) { 2863 if (Available.isInQueue(SU)) 2864 Available.remove(Available.find(SU)); 2865 else { 2866 assert(Pending.isInQueue(SU) && "bad ready count"); 2867 Pending.remove(Pending.find(SU)); 2868 } 2869 } 2870 2871 /// If this queue only has one ready candidate, return it. As a side effect, 2872 /// defer any nodes that now hit a hazard, and advance the cycle until at least 2873 /// one node is ready. If multiple instructions are ready, return NULL. 2874 SUnit *SchedBoundary::pickOnlyChoice() { 2875 if (CheckPending) 2876 releasePending(); 2877 2878 // Defer any ready instrs that now have a hazard. 2879 for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) { 2880 if (checkHazard(*I)) { 2881 Pending.push(*I); 2882 I = Available.remove(I); 2883 continue; 2884 } 2885 ++I; 2886 } 2887 for (unsigned i = 0; Available.empty(); ++i) { 2888 // FIXME: Re-enable assert once PR20057 is resolved. 2889 // assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) && 2890 // "permanent hazard"); 2891 (void)i; 2892 bumpCycle(CurrCycle + 1); 2893 releasePending(); 2894 } 2895 2896 LLVM_DEBUG(Pending.dump()); 2897 LLVM_DEBUG(Available.dump()); 2898 2899 if (Available.size() == 1) 2900 return *Available.begin(); 2901 return nullptr; 2902 } 2903 2904 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2905 2906 /// Dump the content of the \ref ReservedCycles vector for the 2907 /// resources that are used in the basic block. 
2908 /// 2909 LLVM_DUMP_METHOD void SchedBoundary::dumpReservedCycles() const { 2910 if (!SchedModel->hasInstrSchedModel()) 2911 return; 2912 2913 unsigned ResourceCount = SchedModel->getNumProcResourceKinds(); 2914 unsigned StartIdx = 0; 2915 2916 for (unsigned ResIdx = 0; ResIdx < ResourceCount; ++ResIdx) { 2917 const unsigned NumUnits = SchedModel->getProcResource(ResIdx)->NumUnits; 2918 std::string ResName = SchedModel->getResourceName(ResIdx); 2919 for (unsigned UnitIdx = 0; UnitIdx < NumUnits; ++UnitIdx) { 2920 dbgs() << ResName << "(" << UnitIdx << ") = "; 2921 if (SchedModel && SchedModel->enableIntervals()) { 2922 if (ReservedResourceSegments.count(StartIdx + UnitIdx)) 2923 dbgs() << ReservedResourceSegments.at(StartIdx + UnitIdx); 2924 else 2925 dbgs() << "{ }\n"; 2926 } else 2927 dbgs() << ReservedCycles[StartIdx + UnitIdx] << "\n"; 2928 } 2929 StartIdx += NumUnits; 2930 } 2931 } 2932 2933 // This is useful information to dump after bumpNode. 2934 // Note that the Queue contents are more useful before pickNodeFromQueue. 2935 LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const { 2936 unsigned ResFactor; 2937 unsigned ResCount; 2938 if (ZoneCritResIdx) { 2939 ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx); 2940 ResCount = getResourceCount(ZoneCritResIdx); 2941 } else { 2942 ResFactor = SchedModel->getMicroOpFactor(); 2943 ResCount = RetiredMOps * ResFactor; 2944 } 2945 unsigned LFactor = SchedModel->getLatencyFactor(); 2946 dbgs() << Available.getName() << " @" << CurrCycle << "c\n" 2947 << " Retired: " << RetiredMOps; 2948 dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c"; 2949 dbgs() << "\n Critical: " << ResCount / LFactor << "c, " 2950 << ResCount / ResFactor << " " 2951 << SchedModel->getResourceName(ZoneCritResIdx) 2952 << "\n ExpectedLatency: " << ExpectedLatency << "c\n" 2953 << (IsResourceLimited ? " - Resource" : " - Latency") 2954 << " limited.\n"; 2955 if (MISchedDumpReservedCycles) 2956 dumpReservedCycles(); 2957 } 2958 #endif 2959 2960 //===----------------------------------------------------------------------===// 2961 // GenericScheduler - Generic implementation of MachineSchedStrategy. 2962 //===----------------------------------------------------------------------===// 2963 2964 void GenericSchedulerBase::SchedCandidate:: 2965 initResourceDelta(const ScheduleDAGMI *DAG, 2966 const TargetSchedModel *SchedModel) { 2967 if (!Policy.ReduceResIdx && !Policy.DemandResIdx) 2968 return; 2969 2970 const MCSchedClassDesc *SC = DAG->getSchedClass(SU); 2971 for (TargetSchedModel::ProcResIter 2972 PI = SchedModel->getWriteProcResBegin(SC), 2973 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { 2974 if (PI->ProcResourceIdx == Policy.ReduceResIdx) 2975 ResDelta.CritResources += PI->ReleaseAtCycle; 2976 if (PI->ProcResourceIdx == Policy.DemandResIdx) 2977 ResDelta.DemandedResources += PI->ReleaseAtCycle; 2978 } 2979 } 2980 2981 /// Compute remaining latency. We need this both to determine whether the 2982 /// overall schedule has become latency-limited and whether the instructions 2983 /// outside this zone are resource or latency limited. 
2984 /// 2985 /// The "dependent" latency is updated incrementally during scheduling as the 2986 /// max height/depth of scheduled nodes minus the cycles since it was 2987 /// scheduled: 2988 /// DLat = max (N.depth - (CurrCycle - N.ReadyCycle) for N in Zone 2989 /// 2990 /// The "independent" latency is the max ready queue depth: 2991 /// ILat = max N.depth for N in Available|Pending 2992 /// 2993 /// RemainingLatency is the greater of independent and dependent latency. 2994 /// 2995 /// These computations are expensive, especially in DAGs with many edges, so 2996 /// only do them if necessary. 2997 static unsigned computeRemLatency(SchedBoundary &CurrZone) { 2998 unsigned RemLatency = CurrZone.getDependentLatency(); 2999 RemLatency = std::max(RemLatency, 3000 CurrZone.findMaxLatency(CurrZone.Available.elements())); 3001 RemLatency = std::max(RemLatency, 3002 CurrZone.findMaxLatency(CurrZone.Pending.elements())); 3003 return RemLatency; 3004 } 3005 3006 /// Returns true if the current cycle plus remaning latency is greater than 3007 /// the critical path in the scheduling region. 3008 bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy, 3009 SchedBoundary &CurrZone, 3010 bool ComputeRemLatency, 3011 unsigned &RemLatency) const { 3012 // The current cycle is already greater than the critical path, so we are 3013 // already latency limited and don't need to compute the remaining latency. 3014 if (CurrZone.getCurrCycle() > Rem.CriticalPath) 3015 return true; 3016 3017 // If we haven't scheduled anything yet, then we aren't latency limited. 3018 if (CurrZone.getCurrCycle() == 0) 3019 return false; 3020 3021 if (ComputeRemLatency) 3022 RemLatency = computeRemLatency(CurrZone); 3023 3024 return RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath; 3025 } 3026 3027 /// Set the CandPolicy given a scheduling zone given the current resources and 3028 /// latencies inside and outside the zone. 3029 void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA, 3030 SchedBoundary &CurrZone, 3031 SchedBoundary *OtherZone) { 3032 // Apply preemptive heuristics based on the total latency and resources 3033 // inside and outside this zone. Potential stalls should be considered before 3034 // following this policy. 3035 3036 // Compute the critical resource outside the zone. 3037 unsigned OtherCritIdx = 0; 3038 unsigned OtherCount = 3039 OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0; 3040 3041 bool OtherResLimited = false; 3042 unsigned RemLatency = 0; 3043 bool RemLatencyComputed = false; 3044 if (SchedModel->hasInstrSchedModel() && OtherCount != 0) { 3045 RemLatency = computeRemLatency(CurrZone); 3046 RemLatencyComputed = true; 3047 OtherResLimited = checkResourceLimit(SchedModel->getLatencyFactor(), 3048 OtherCount, RemLatency, false); 3049 } 3050 3051 // Schedule aggressively for latency in PostRA mode. We don't check for 3052 // acyclic latency during PostRA, and highly out-of-order processors will 3053 // skip PostRA scheduling. 3054 if (!OtherResLimited && 3055 (IsPostRA || shouldReduceLatency(Policy, CurrZone, !RemLatencyComputed, 3056 RemLatency))) { 3057 Policy.ReduceLatency |= true; 3058 LLVM_DEBUG(dbgs() << " " << CurrZone.Available.getName() 3059 << " RemainingLatency " << RemLatency << " + " 3060 << CurrZone.getCurrCycle() << "c > CritPath " 3061 << Rem.CriticalPath << "\n"); 3062 } 3063 // If the same resource is limiting inside and outside the zone, do nothing. 
3064 if (CurrZone.getZoneCritResIdx() == OtherCritIdx) 3065 return; 3066 3067 LLVM_DEBUG(if (CurrZone.isResourceLimited()) { 3068 dbgs() << " " << CurrZone.Available.getName() << " ResourceLimited: " 3069 << SchedModel->getResourceName(CurrZone.getZoneCritResIdx()) << "\n"; 3070 } if (OtherResLimited) dbgs() 3071 << " RemainingLimit: " 3072 << SchedModel->getResourceName(OtherCritIdx) << "\n"; 3073 if (!CurrZone.isResourceLimited() && !OtherResLimited) dbgs() 3074 << " Latency limited both directions.\n"); 3075 3076 if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx) 3077 Policy.ReduceResIdx = CurrZone.getZoneCritResIdx(); 3078 3079 if (OtherResLimited) 3080 Policy.DemandResIdx = OtherCritIdx; 3081 } 3082 3083 #ifndef NDEBUG 3084 const char *GenericSchedulerBase::getReasonStr( 3085 GenericSchedulerBase::CandReason Reason) { 3086 switch (Reason) { 3087 case NoCand: return "NOCAND "; 3088 case Only1: return "ONLY1 "; 3089 case PhysReg: return "PHYS-REG "; 3090 case RegExcess: return "REG-EXCESS"; 3091 case RegCritical: return "REG-CRIT "; 3092 case Stall: return "STALL "; 3093 case Cluster: return "CLUSTER "; 3094 case Weak: return "WEAK "; 3095 case RegMax: return "REG-MAX "; 3096 case ResourceReduce: return "RES-REDUCE"; 3097 case ResourceDemand: return "RES-DEMAND"; 3098 case TopDepthReduce: return "TOP-DEPTH "; 3099 case TopPathReduce: return "TOP-PATH "; 3100 case BotHeightReduce:return "BOT-HEIGHT"; 3101 case BotPathReduce: return "BOT-PATH "; 3102 case NextDefUse: return "DEF-USE "; 3103 case NodeOrder: return "ORDER "; 3104 }; 3105 llvm_unreachable("Unknown reason!"); 3106 } 3107 3108 void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) { 3109 PressureChange P; 3110 unsigned ResIdx = 0; 3111 unsigned Latency = 0; 3112 switch (Cand.Reason) { 3113 default: 3114 break; 3115 case RegExcess: 3116 P = Cand.RPDelta.Excess; 3117 break; 3118 case RegCritical: 3119 P = Cand.RPDelta.CriticalMax; 3120 break; 3121 case RegMax: 3122 P = Cand.RPDelta.CurrentMax; 3123 break; 3124 case ResourceReduce: 3125 ResIdx = Cand.Policy.ReduceResIdx; 3126 break; 3127 case ResourceDemand: 3128 ResIdx = Cand.Policy.DemandResIdx; 3129 break; 3130 case TopDepthReduce: 3131 Latency = Cand.SU->getDepth(); 3132 break; 3133 case TopPathReduce: 3134 Latency = Cand.SU->getHeight(); 3135 break; 3136 case BotHeightReduce: 3137 Latency = Cand.SU->getHeight(); 3138 break; 3139 case BotPathReduce: 3140 Latency = Cand.SU->getDepth(); 3141 break; 3142 } 3143 dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); 3144 if (P.isValid()) 3145 dbgs() << " " << TRI->getRegPressureSetName(P.getPSet()) 3146 << ":" << P.getUnitInc() << " "; 3147 else 3148 dbgs() << " "; 3149 if (ResIdx) 3150 dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " "; 3151 else 3152 dbgs() << " "; 3153 if (Latency) 3154 dbgs() << " " << Latency << " cycles "; 3155 else 3156 dbgs() << " "; 3157 dbgs() << '\n'; 3158 } 3159 #endif 3160 3161 namespace llvm { 3162 /// Return true if this heuristic determines order. 3163 /// TODO: Consider refactor return type of these functions as integer or enum, 3164 /// as we may need to differentiate whether TryCand is better than Cand. 
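///
/// As a sketch of how the protocol reads at a call site (TryCand/Cand are the
/// usual candidates; the pressure values are hypothetical):
/// \code
///   // TryCand would add 1 pressure unit, Cand would add 3: TryCand wins.
///   tryLess(1, 3, TryCand, Cand, GenericSchedulerBase::RegExcess); // true
///   // Swapped: Cand stays best; its Reason becomes RegExcess if weaker.
///   tryLess(3, 1, TryCand, Cand, GenericSchedulerBase::RegExcess); // true
///   // A tie decides nothing and falls through to the next heuristic.
///   tryLess(2, 2, TryCand, Cand, GenericSchedulerBase::RegExcess); // false
/// \endcode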
3165 bool tryLess(int TryVal, int CandVal, 3166 GenericSchedulerBase::SchedCandidate &TryCand, 3167 GenericSchedulerBase::SchedCandidate &Cand, 3168 GenericSchedulerBase::CandReason Reason) { 3169 if (TryVal < CandVal) { 3170 TryCand.Reason = Reason; 3171 return true; 3172 } 3173 if (TryVal > CandVal) { 3174 if (Cand.Reason > Reason) 3175 Cand.Reason = Reason; 3176 return true; 3177 } 3178 return false; 3179 } 3180 3181 bool tryGreater(int TryVal, int CandVal, 3182 GenericSchedulerBase::SchedCandidate &TryCand, 3183 GenericSchedulerBase::SchedCandidate &Cand, 3184 GenericSchedulerBase::CandReason Reason) { 3185 if (TryVal > CandVal) { 3186 TryCand.Reason = Reason; 3187 return true; 3188 } 3189 if (TryVal < CandVal) { 3190 if (Cand.Reason > Reason) 3191 Cand.Reason = Reason; 3192 return true; 3193 } 3194 return false; 3195 } 3196 3197 bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, 3198 GenericSchedulerBase::SchedCandidate &Cand, 3199 SchedBoundary &Zone) { 3200 if (Zone.isTop()) { 3201 // Prefer the candidate with the lesser depth, but only if one of them has 3202 // depth greater than the total latency scheduled so far, otherwise either 3203 // of them could be scheduled now with no stall. 3204 if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) > 3205 Zone.getScheduledLatency()) { 3206 if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), 3207 TryCand, Cand, GenericSchedulerBase::TopDepthReduce)) 3208 return true; 3209 } 3210 if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(), 3211 TryCand, Cand, GenericSchedulerBase::TopPathReduce)) 3212 return true; 3213 } else { 3214 // Prefer the candidate with the lesser height, but only if one of them has 3215 // height greater than the total latency scheduled so far, otherwise either 3216 // of them could be scheduled now with no stall. 3217 if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) > 3218 Zone.getScheduledLatency()) { 3219 if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), 3220 TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) 3221 return true; 3222 } 3223 if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(), 3224 TryCand, Cand, GenericSchedulerBase::BotPathReduce)) 3225 return true; 3226 } 3227 return false; 3228 } 3229 } // end namespace llvm 3230 3231 static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) { 3232 LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") 3233 << GenericSchedulerBase::getReasonStr(Reason) << '\n'); 3234 } 3235 3236 static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) { 3237 tracePick(Cand.Reason, Cand.AtTop); 3238 } 3239 3240 void GenericScheduler::initialize(ScheduleDAGMI *dag) { 3241 assert(dag->hasVRegLiveness() && 3242 "(PreRA)GenericScheduler needs vreg liveness"); 3243 DAG = static_cast<ScheduleDAGMILive*>(dag); 3244 SchedModel = DAG->getSchedModel(); 3245 TRI = DAG->TRI; 3246 3247 if (RegionPolicy.ComputeDFSResult) 3248 DAG->computeDFSResult(); 3249 3250 Rem.init(DAG, SchedModel); 3251 Top.init(DAG, SchedModel, &Rem); 3252 Bot.init(DAG, SchedModel, &Rem); 3253 3254 // Initialize resource counts. 3255 3256 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or 3257 // are disabled, then these HazardRecs will be disabled. 
3258 const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); 3259 if (!Top.HazardRec) { 3260 Top.HazardRec = 3261 DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( 3262 Itin, DAG); 3263 } 3264 if (!Bot.HazardRec) { 3265 Bot.HazardRec = 3266 DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( 3267 Itin, DAG); 3268 } 3269 TopCand.SU = nullptr; 3270 BotCand.SU = nullptr; 3271 } 3272 3273 /// Initialize the per-region scheduling policy. 3274 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, 3275 MachineBasicBlock::iterator End, 3276 unsigned NumRegionInstrs) { 3277 const MachineFunction &MF = *Begin->getMF(); 3278 const TargetLowering *TLI = MF.getSubtarget().getTargetLowering(); 3279 3280 // Avoid setting up the register pressure tracker for small regions to save 3281 // compile time. As a rough heuristic, only track pressure when the number of 3282 // schedulable instructions exceeds half the integer register file. 3283 RegionPolicy.ShouldTrackPressure = true; 3284 for (unsigned VT = MVT::i32; VT > (unsigned)MVT::i1; --VT) { 3285 MVT::SimpleValueType LegalIntVT = (MVT::SimpleValueType)VT; 3286 if (TLI->isTypeLegal(LegalIntVT)) { 3287 unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs( 3288 TLI->getRegClassFor(LegalIntVT)); 3289 RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2); 3290 } 3291 } 3292 3293 // For generic targets, we default to bottom-up, because it's simpler and more 3294 // compile-time optimizations have been implemented in that direction. 3295 RegionPolicy.OnlyBottomUp = true; 3296 3297 // Allow the subtarget to override default policy. 3298 MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs); 3299 3300 // After subtarget overrides, apply command line options. 3301 if (!EnableRegPressure) { 3302 RegionPolicy.ShouldTrackPressure = false; 3303 RegionPolicy.ShouldTrackLaneMasks = false; 3304 } 3305 3306 // Check -misched-topdown/bottomup can force or unforce scheduling direction. 3307 // e.g. -misched-bottomup=false allows scheduling in both directions. 3308 assert((!ForceTopDown || !ForceBottomUp) && 3309 "-misched-topdown incompatible with -misched-bottomup"); 3310 if (ForceBottomUp.getNumOccurrences() > 0) { 3311 RegionPolicy.OnlyBottomUp = ForceBottomUp; 3312 if (RegionPolicy.OnlyBottomUp) 3313 RegionPolicy.OnlyTopDown = false; 3314 } 3315 if (ForceTopDown.getNumOccurrences() > 0) { 3316 RegionPolicy.OnlyTopDown = ForceTopDown; 3317 if (RegionPolicy.OnlyTopDown) 3318 RegionPolicy.OnlyBottomUp = false; 3319 } 3320 } 3321 3322 void GenericScheduler::dumpPolicy() const { 3323 // Cannot completely remove virtual function even in release mode. 3324 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3325 dbgs() << "GenericScheduler RegionPolicy: " 3326 << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure 3327 << " OnlyTopDown=" << RegionPolicy.OnlyTopDown 3328 << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp 3329 << "\n"; 3330 #endif 3331 } 3332 3333 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic 3334 /// critical path by more cycles than it takes to drain the instruction buffer. 3335 /// We estimate an upper bounds on in-flight instructions as: 3336 /// 3337 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height ) 3338 /// InFlightIterations = AcyclicPath / CyclesPerIteration 3339 /// InFlightResources = InFlightIterations * LoopResources 3340 /// 3341 /// TODO: Check execution resources in addition to IssueCount. 
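///
/// A worked sketch with hypothetical numbers, assuming unit latency and
/// micro-op factors (CyclicCritPath = 4c, CriticalPath = 12c,
/// RemIssueCount = 8 uops, MicroOpBufferSize = 8):
/// \code
///   IterCount     = max(4 * 1, 8)        = 8
///   AcyclicCount  = 12 * 1               = 12
///   InFlightCount = (12 * 8 + 8 - 1) / 8 = 12  // ~1.5 iterations in flight
///   BufferLimit   = 8 * 1                = 8
///   // InFlightCount > BufferLimit, so IsAcyclicLatencyLimited is set.
/// \endcode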
3342 void GenericScheduler::checkAcyclicLatency() { 3343 if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath) 3344 return; 3345 3346 // Scaled number of cycles per loop iteration. 3347 unsigned IterCount = 3348 std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(), 3349 Rem.RemIssueCount); 3350 // Scaled acyclic critical path. 3351 unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor(); 3352 // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop 3353 unsigned InFlightCount = 3354 (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount; 3355 unsigned BufferLimit = 3356 SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor(); 3357 3358 Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit; 3359 3360 LLVM_DEBUG( 3361 dbgs() << "IssueCycles=" 3362 << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c " 3363 << "IterCycles=" << IterCount / SchedModel->getLatencyFactor() 3364 << "c NumIters=" << (AcyclicCount + IterCount - 1) / IterCount 3365 << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor() 3366 << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n"; 3367 if (Rem.IsAcyclicLatencyLimited) dbgs() << " ACYCLIC LATENCY LIMIT\n"); 3368 } 3369 3370 void GenericScheduler::registerRoots() { 3371 Rem.CriticalPath = DAG->ExitSU.getDepth(); 3372 3373 // Some roots may not feed into ExitSU. Check all of them in case. 3374 for (const SUnit *SU : Bot.Available) { 3375 if (SU->getDepth() > Rem.CriticalPath) 3376 Rem.CriticalPath = SU->getDepth(); 3377 } 3378 LLVM_DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n'); 3379 if (DumpCriticalPathLength) { 3380 errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n"; 3381 } 3382 3383 if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) { 3384 Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); 3385 checkAcyclicLatency(); 3386 } 3387 } 3388 3389 namespace llvm { 3390 bool tryPressure(const PressureChange &TryP, 3391 const PressureChange &CandP, 3392 GenericSchedulerBase::SchedCandidate &TryCand, 3393 GenericSchedulerBase::SchedCandidate &Cand, 3394 GenericSchedulerBase::CandReason Reason, 3395 const TargetRegisterInfo *TRI, 3396 const MachineFunction &MF) { 3397 // If one candidate decreases and the other increases, go with it. 3398 // Invalid candidates have UnitInc==0. 3399 if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, 3400 Reason)) { 3401 return true; 3402 } 3403 // Do not compare the magnitude of pressure changes between top and bottom 3404 // boundary. 3405 if (Cand.AtTop != TryCand.AtTop) 3406 return false; 3407 3408 // If both candidates affect the same set in the same boundary, go with the 3409 // smallest increase. 3410 unsigned TryPSet = TryP.getPSetOrMax(); 3411 unsigned CandPSet = CandP.getPSetOrMax(); 3412 if (TryPSet == CandPSet) { 3413 return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, 3414 Reason); 3415 } 3416 3417 int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) : 3418 std::numeric_limits<int>::max(); 3419 3420 int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) : 3421 std::numeric_limits<int>::max(); 3422 3423 // If the candidates are decreasing pressure, reverse priority. 3424 if (TryP.getUnitInc() < 0) 3425 std::swap(TryRank, CandRank); 3426 return tryGreater(TryRank, CandRank, TryCand, Cand, Reason); 3427 } 3428 3429 unsigned getWeakLeft(const SUnit *SU, bool isTop) { 3430 return (isTop) ? 
SU->WeakPredsLeft : SU->WeakSuccsLeft; 3431 } 3432 3433 /// Minimize physical register live ranges. Regalloc wants them adjacent to 3434 /// their physreg def/use. 3435 /// 3436 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf 3437 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled 3438 /// with the operation that produces or consumes the physreg. We'll do this when 3439 /// regalloc has support for parallel copies. 3440 int biasPhysReg(const SUnit *SU, bool isTop) { 3441 const MachineInstr *MI = SU->getInstr(); 3442 3443 if (MI->isCopy()) { 3444 unsigned ScheduledOper = isTop ? 1 : 0; 3445 unsigned UnscheduledOper = isTop ? 0 : 1; 3446 // If we have already scheduled the physreg produce/consumer, immediately 3447 // schedule the copy. 3448 if (MI->getOperand(ScheduledOper).getReg().isPhysical()) 3449 return 1; 3450 // If the physreg is at the boundary, defer it. Otherwise schedule it 3451 // immediately to free the dependent. We can hoist the copy later. 3452 bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; 3453 if (MI->getOperand(UnscheduledOper).getReg().isPhysical()) 3454 return AtBoundary ? -1 : 1; 3455 } 3456 3457 if (MI->isMoveImmediate()) { 3458 // If we have a move immediate and all successors have been assigned, bias 3459 // towards scheduling this later. Make sure all register defs are to 3460 // physical registers. 3461 bool DoBias = true; 3462 for (const MachineOperand &Op : MI->defs()) { 3463 if (Op.isReg() && !Op.getReg().isPhysical()) { 3464 DoBias = false; 3465 break; 3466 } 3467 } 3468 3469 if (DoBias) 3470 return isTop ? -1 : 1; 3471 } 3472 3473 return 0; 3474 } 3475 } // end namespace llvm 3476 3477 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU, 3478 bool AtTop, 3479 const RegPressureTracker &RPTracker, 3480 RegPressureTracker &TempTracker) { 3481 Cand.SU = SU; 3482 Cand.AtTop = AtTop; 3483 if (DAG->isTrackingPressure()) { 3484 if (AtTop) { 3485 TempTracker.getMaxDownwardPressureDelta( 3486 Cand.SU->getInstr(), 3487 Cand.RPDelta, 3488 DAG->getRegionCriticalPSets(), 3489 DAG->getRegPressure().MaxSetPressure); 3490 } else { 3491 if (VerifyScheduling) { 3492 TempTracker.getMaxUpwardPressureDelta( 3493 Cand.SU->getInstr(), 3494 &DAG->getPressureDiff(Cand.SU), 3495 Cand.RPDelta, 3496 DAG->getRegionCriticalPSets(), 3497 DAG->getRegPressure().MaxSetPressure); 3498 } else { 3499 RPTracker.getUpwardPressureDelta( 3500 Cand.SU->getInstr(), 3501 DAG->getPressureDiff(Cand.SU), 3502 Cand.RPDelta, 3503 DAG->getRegionCriticalPSets(), 3504 DAG->getRegPressure().MaxSetPressure); 3505 } 3506 } 3507 } 3508 LLVM_DEBUG(if (Cand.RPDelta.Excess.isValid()) dbgs() 3509 << " Try SU(" << Cand.SU->NodeNum << ") " 3510 << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) << ":" 3511 << Cand.RPDelta.Excess.getUnitInc() << "\n"); 3512 } 3513 3514 /// Apply a set of heuristics to a new candidate. Heuristics are currently 3515 /// hierarchical. This may be more efficient than a graduated cost model because 3516 /// we don't need to evaluate all aspects of the model for each node in the 3517 /// queue. But it's really done to make the heuristics easier to debug and 3518 /// statistically analyze. 3519 /// 3520 /// \param Cand provides the policy and current best candidate. 3521 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. 3522 /// \param Zone describes the scheduled zone that we are extending, or nullptr 3523 /// if Cand is from a different zone than TryCand. 
3524 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) 3525 bool GenericScheduler::tryCandidate(SchedCandidate &Cand, 3526 SchedCandidate &TryCand, 3527 SchedBoundary *Zone) const { 3528 // Initialize the candidate if needed. 3529 if (!Cand.isValid()) { 3530 TryCand.Reason = NodeOrder; 3531 return true; 3532 } 3533 3534 // Bias PhysReg Defs and copies to their uses and defined respectively. 3535 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), 3536 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) 3537 return TryCand.Reason != NoCand; 3538 3539 // Avoid exceeding the target's limit. 3540 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess, 3541 Cand.RPDelta.Excess, 3542 TryCand, Cand, RegExcess, TRI, 3543 DAG->MF)) 3544 return TryCand.Reason != NoCand; 3545 3546 // Avoid increasing the max critical pressure in the scheduled region. 3547 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, 3548 Cand.RPDelta.CriticalMax, 3549 TryCand, Cand, RegCritical, TRI, 3550 DAG->MF)) 3551 return TryCand.Reason != NoCand; 3552 3553 // We only compare a subset of features when comparing nodes between 3554 // Top and Bottom boundary. Some properties are simply incomparable, in many 3555 // other instances we should only override the other boundary if something 3556 // is a clear good pick on one boundary. Skip heuristics that are more 3557 // "tie-breaking" in nature. 3558 bool SameBoundary = Zone != nullptr; 3559 if (SameBoundary) { 3560 // For loops that are acyclic path limited, aggressively schedule for 3561 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal 3562 // heuristics to take precedence. 3563 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && 3564 tryLatency(TryCand, Cand, *Zone)) 3565 return TryCand.Reason != NoCand; 3566 3567 // Prioritize instructions that read unbuffered resources by stall cycles. 3568 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), 3569 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) 3570 return TryCand.Reason != NoCand; 3571 } 3572 3573 // Keep clustered nodes together to encourage downstream peephole 3574 // optimizations which may reduce resource requirements. 3575 // 3576 // This is a best effort to set things up for a post-RA pass. Optimizations 3577 // like generating loads of multiple registers should ideally be done within 3578 // the scheduler pass by combining the loads during DAG postprocessing. 3579 const SUnit *CandNextClusterSU = 3580 Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); 3581 const SUnit *TryCandNextClusterSU = 3582 TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); 3583 if (tryGreater(TryCand.SU == TryCandNextClusterSU, 3584 Cand.SU == CandNextClusterSU, 3585 TryCand, Cand, Cluster)) 3586 return TryCand.Reason != NoCand; 3587 3588 if (SameBoundary) { 3589 // Weak edges are for clustering and other constraints. 3590 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), 3591 getWeakLeft(Cand.SU, Cand.AtTop), 3592 TryCand, Cand, Weak)) 3593 return TryCand.Reason != NoCand; 3594 } 3595 3596 // Avoid increasing the max pressure of the entire region. 3597 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, 3598 Cand.RPDelta.CurrentMax, 3599 TryCand, Cand, RegMax, TRI, 3600 DAG->MF)) 3601 return TryCand.Reason != NoCand; 3602 3603 if (SameBoundary) { 3604 // Avoid critical resource consumption and balance the schedule. 
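// ResDelta is only filled in lazily here: initResourceDelta() sums the
// candidate's ReleaseAtCycle counts on the zone's critical resource
// (Policy.ReduceResIdx) and on the resource demanded outside the zone
// (Policy.DemandResIdx), so the two comparisons below prefer a node that
// relieves the former and feeds the latter.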
3605 TryCand.initResourceDelta(DAG, SchedModel); 3606 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, 3607 TryCand, Cand, ResourceReduce)) 3608 return TryCand.Reason != NoCand; 3609 if (tryGreater(TryCand.ResDelta.DemandedResources, 3610 Cand.ResDelta.DemandedResources, 3611 TryCand, Cand, ResourceDemand)) 3612 return TryCand.Reason != NoCand; 3613 3614 // Avoid serializing long latency dependence chains. 3615 // For acyclic path limited loops, latency was already checked above. 3616 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && 3617 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) 3618 return TryCand.Reason != NoCand; 3619 3620 // Fall through to original instruction order. 3621 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) 3622 || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { 3623 TryCand.Reason = NodeOrder; 3624 return true; 3625 } 3626 } 3627 3628 return false; 3629 } 3630 3631 /// Pick the best candidate from the queue. 3632 /// 3633 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during 3634 /// DAG building. To adjust for the current scheduling location we need to 3635 /// maintain the number of vreg uses remaining to be top-scheduled. 3636 void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, 3637 const CandPolicy &ZonePolicy, 3638 const RegPressureTracker &RPTracker, 3639 SchedCandidate &Cand) { 3640 // getMaxPressureDelta temporarily modifies the tracker. 3641 RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); 3642 3643 ReadyQueue &Q = Zone.Available; 3644 for (SUnit *SU : Q) { 3645 3646 SchedCandidate TryCand(ZonePolicy); 3647 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker); 3648 // Pass SchedBoundary only when comparing nodes from the same boundary. 3649 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; 3650 if (tryCandidate(Cand, TryCand, ZoneArg)) { 3651 // Initialize resource delta if needed in case future heuristics query it. 3652 if (TryCand.ResDelta == SchedResourceDelta()) 3653 TryCand.initResourceDelta(DAG, SchedModel); 3654 Cand.setBest(TryCand); 3655 LLVM_DEBUG(traceCandidate(Cand)); 3656 } 3657 } 3658 } 3659 3660 /// Pick the best candidate node from either the top or bottom queue. 3661 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) { 3662 // Schedule as far as possible in the direction of no choice. This is most 3663 // efficient, but also provides the best heuristics for CriticalPSets. 3664 if (SUnit *SU = Bot.pickOnlyChoice()) { 3665 IsTopNode = false; 3666 tracePick(Only1, false); 3667 return SU; 3668 } 3669 if (SUnit *SU = Top.pickOnlyChoice()) { 3670 IsTopNode = true; 3671 tracePick(Only1, true); 3672 return SU; 3673 } 3674 // Set the bottom-up policy based on the state of the current bottom zone and 3675 // the instructions outside the zone, including the top zone. 3676 CandPolicy BotPolicy; 3677 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); 3678 // Set the top-down policy based on the state of the current top zone and 3679 // the instructions outside the zone, including the bottom zone. 3680 CandPolicy TopPolicy; 3681 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); 3682 3683 // See if BotCand is still valid (because we previously scheduled from Top). 
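// The cached candidate is reused only if it has not been scheduled in the
// meantime and was computed under the same zone policy; otherwise it is
// re-picked from the queue (and cross-checked against a fresh pick when
// -verify-misched is enabled).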
3684 LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); 3685 if (!BotCand.isValid() || BotCand.SU->isScheduled || 3686 BotCand.Policy != BotPolicy) { 3687 BotCand.reset(CandPolicy()); 3688 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); 3689 assert(BotCand.Reason != NoCand && "failed to find the first candidate"); 3690 } else { 3691 LLVM_DEBUG(traceCandidate(BotCand)); 3692 #ifndef NDEBUG 3693 if (VerifyScheduling) { 3694 SchedCandidate TCand; 3695 TCand.reset(CandPolicy()); 3696 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); 3697 assert(TCand.SU == BotCand.SU && 3698 "Last pick result should correspond to re-picking right now"); 3699 } 3700 #endif 3701 } 3702 3703 // Check if the top Q has a better candidate. 3704 LLVM_DEBUG(dbgs() << "Picking from Top:\n"); 3705 if (!TopCand.isValid() || TopCand.SU->isScheduled || 3706 TopCand.Policy != TopPolicy) { 3707 TopCand.reset(CandPolicy()); 3708 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); 3709 assert(TopCand.Reason != NoCand && "failed to find the first candidate"); 3710 } else { 3711 LLVM_DEBUG(traceCandidate(TopCand)); 3712 #ifndef NDEBUG 3713 if (VerifyScheduling) { 3714 SchedCandidate TCand; 3715 TCand.reset(CandPolicy()); 3716 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); 3717 assert(TCand.SU == TopCand.SU && 3718 "Last pick result should correspond to re-picking right now"); 3719 } 3720 #endif 3721 } 3722 3723 // Pick best from BotCand and TopCand. 3724 assert(BotCand.isValid()); 3725 assert(TopCand.isValid()); 3726 SchedCandidate Cand = BotCand; 3727 TopCand.Reason = NoCand; 3728 if (tryCandidate(Cand, TopCand, nullptr)) { 3729 Cand.setBest(TopCand); 3730 LLVM_DEBUG(traceCandidate(Cand)); 3731 } 3732 3733 IsTopNode = Cand.AtTop; 3734 tracePick(Cand); 3735 return Cand.SU; 3736 } 3737 3738 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy. 3739 SUnit *GenericScheduler::pickNode(bool &IsTopNode) { 3740 if (DAG->top() == DAG->bottom()) { 3741 assert(Top.Available.empty() && Top.Pending.empty() && 3742 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); 3743 return nullptr; 3744 } 3745 SUnit *SU; 3746 do { 3747 if (RegionPolicy.OnlyTopDown) { 3748 SU = Top.pickOnlyChoice(); 3749 if (!SU) { 3750 CandPolicy NoPolicy; 3751 TopCand.reset(NoPolicy); 3752 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); 3753 assert(TopCand.Reason != NoCand && "failed to find a candidate"); 3754 tracePick(TopCand); 3755 SU = TopCand.SU; 3756 } 3757 IsTopNode = true; 3758 } else if (RegionPolicy.OnlyBottomUp) { 3759 SU = Bot.pickOnlyChoice(); 3760 if (!SU) { 3761 CandPolicy NoPolicy; 3762 BotCand.reset(NoPolicy); 3763 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); 3764 assert(BotCand.Reason != NoCand && "failed to find a candidate"); 3765 tracePick(BotCand); 3766 SU = BotCand.SU; 3767 } 3768 IsTopNode = false; 3769 } else { 3770 SU = pickNodeBidirectional(IsTopNode); 3771 } 3772 } while (SU->isScheduled); 3773 3774 if (SU->isTopReady()) 3775 Top.removeReady(SU); 3776 if (SU->isBottomReady()) 3777 Bot.removeReady(SU); 3778 3779 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " 3780 << *SU->getInstr()); 3781 return SU; 3782 } 3783 3784 void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) { 3785 MachineBasicBlock::iterator InsertPos = SU->getInstr(); 3786 if (!isTop) 3787 ++InsertPos; 3788 SmallVectorImpl<SDep> &Deps = isTop ? 
SU->Preds : SU->Succs; 3789 3790 // Find already scheduled copies with a single physreg dependence and move 3791 // them just above the scheduled instruction. 3792 for (SDep &Dep : Deps) { 3793 if (Dep.getKind() != SDep::Data || 3794 !Register::isPhysicalRegister(Dep.getReg())) 3795 continue; 3796 SUnit *DepSU = Dep.getSUnit(); 3797 if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) 3798 continue; 3799 MachineInstr *Copy = DepSU->getInstr(); 3800 if (!Copy->isCopy() && !Copy->isMoveImmediate()) 3801 continue; 3802 LLVM_DEBUG(dbgs() << " Rescheduling physreg copy "; 3803 DAG->dumpNode(*Dep.getSUnit())); 3804 DAG->moveInstruction(Copy, InsertPos); 3805 } 3806 } 3807 3808 /// Update the scheduler's state after scheduling a node. This is the same node 3809 /// that was just returned by pickNode(). However, ScheduleDAGMILive needs to 3810 /// update it's state based on the current cycle before MachineSchedStrategy 3811 /// does. 3812 /// 3813 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling 3814 /// them here. See comments in biasPhysReg. 3815 void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { 3816 if (IsTopNode) { 3817 SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); 3818 Top.bumpNode(SU); 3819 if (SU->hasPhysRegUses) 3820 reschedulePhysReg(SU, true); 3821 } else { 3822 SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle()); 3823 Bot.bumpNode(SU); 3824 if (SU->hasPhysRegDefs) 3825 reschedulePhysReg(SU, false); 3826 } 3827 } 3828 3829 /// Create the standard converging machine scheduler. This will be used as the 3830 /// default scheduler if the target does not set a default. 3831 ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) { 3832 ScheduleDAGMILive *DAG = 3833 new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)); 3834 // Register DAG post-processors. 3835 // 3836 // FIXME: extend the mutation API to allow earlier mutations to instantiate 3837 // data and pass it to later mutations. Have a single mutation that gathers 3838 // the interesting nodes in one pass. 3839 DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); 3840 3841 const TargetSubtargetInfo &STI = C->MF->getSubtarget(); 3842 // Add MacroFusion mutation if fusions are not empty. 3843 const auto &MacroFusions = STI.getMacroFusions(); 3844 if (!MacroFusions.empty()) 3845 DAG->addMutation(createMacroFusionDAGMutation(MacroFusions)); 3846 return DAG; 3847 } 3848 3849 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) { 3850 return createGenericSchedLive(C); 3851 } 3852 3853 static MachineSchedRegistry 3854 GenericSchedRegistry("converge", "Standard converging scheduler.", 3855 createConvergingSched); 3856 3857 //===----------------------------------------------------------------------===// 3858 // PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy. 3859 //===----------------------------------------------------------------------===// 3860 3861 void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { 3862 DAG = Dag; 3863 SchedModel = DAG->getSchedModel(); 3864 TRI = DAG->TRI; 3865 3866 Rem.init(DAG, SchedModel); 3867 Top.init(DAG, SchedModel, &Rem); 3868 Bot.init(DAG, SchedModel, &Rem); 3869 3870 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, 3871 // or are disabled, then these HazardRecs will be disabled. 
3872 const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); 3873 if (!Top.HazardRec) { 3874 Top.HazardRec = 3875 DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( 3876 Itin, DAG); 3877 } 3878 if (!Bot.HazardRec) { 3879 Bot.HazardRec = 3880 DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( 3881 Itin, DAG); 3882 } 3883 } 3884 3885 void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, 3886 MachineBasicBlock::iterator End, 3887 unsigned NumRegionInstrs) { 3888 if (PostRADirection == MISchedPostRASched::TopDown) { 3889 RegionPolicy.OnlyTopDown = true; 3890 RegionPolicy.OnlyBottomUp = false; 3891 } else if (PostRADirection == MISchedPostRASched::BottomUp) { 3892 RegionPolicy.OnlyTopDown = false; 3893 RegionPolicy.OnlyBottomUp = true; 3894 } 3895 } 3896 3897 void PostGenericScheduler::registerRoots() { 3898 Rem.CriticalPath = DAG->ExitSU.getDepth(); 3899 3900 // Some roots may not feed into ExitSU. Check all of them in case. 3901 for (const SUnit *SU : Bot.Available) { 3902 if (SU->getDepth() > Rem.CriticalPath) 3903 Rem.CriticalPath = SU->getDepth(); 3904 } 3905 LLVM_DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n'); 3906 if (DumpCriticalPathLength) { 3907 errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n"; 3908 } 3909 } 3910 3911 /// Apply a set of heuristics to a new candidate for PostRA scheduling. 3912 /// 3913 /// \param Cand provides the policy and current best candidate. 3914 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. 3915 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) 3916 bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand, 3917 SchedCandidate &TryCand) { 3918 // Initialize the candidate if needed. 3919 if (!Cand.isValid()) { 3920 TryCand.Reason = NodeOrder; 3921 return true; 3922 } 3923 3924 // Prioritize instructions that read unbuffered resources by stall cycles. 3925 if (tryLess(Top.getLatencyStallCycles(TryCand.SU), 3926 Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) 3927 return TryCand.Reason != NoCand; 3928 3929 // Keep clustered nodes together. 3930 if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(), 3931 Cand.SU == DAG->getNextClusterSucc(), 3932 TryCand, Cand, Cluster)) 3933 return TryCand.Reason != NoCand; 3934 3935 // Avoid critical resource consumption and balance the schedule. 3936 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, 3937 TryCand, Cand, ResourceReduce)) 3938 return TryCand.Reason != NoCand; 3939 if (tryGreater(TryCand.ResDelta.DemandedResources, 3940 Cand.ResDelta.DemandedResources, 3941 TryCand, Cand, ResourceDemand)) 3942 return TryCand.Reason != NoCand; 3943 3944 // Avoid serializing long latency dependence chains. 3945 if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) { 3946 return TryCand.Reason != NoCand; 3947 } 3948 3949 // Fall through to original instruction order. 
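// A lower NodeNum corresponds to an earlier position in the original
// instruction stream for this region, so original order is the final
// tie-breaker.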
3950 if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { 3951 TryCand.Reason = NodeOrder; 3952 return true; 3953 } 3954 3955 return false; 3956 } 3957 3958 void PostGenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, 3959 SchedCandidate &Cand) { 3960 ReadyQueue &Q = Zone.Available; 3961 for (SUnit *SU : Q) { 3962 SchedCandidate TryCand(Cand.Policy); 3963 TryCand.SU = SU; 3964 TryCand.AtTop = Zone.isTop(); 3965 TryCand.initResourceDelta(DAG, SchedModel); 3966 if (tryCandidate(Cand, TryCand)) { 3967 Cand.setBest(TryCand); 3968 LLVM_DEBUG(traceCandidate(Cand)); 3969 } 3970 } 3971 } 3972 3973 /// Pick the next node to schedule. 3974 SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { 3975 if (DAG->top() == DAG->bottom()) { 3976 assert(Top.Available.empty() && Top.Pending.empty() && 3977 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); 3978 return nullptr; 3979 } 3980 SUnit *SU; 3981 do { 3982 if (RegionPolicy.OnlyBottomUp) { 3983 assert(!RegionPolicy.OnlyTopDown); 3984 SU = Bot.pickOnlyChoice(); 3985 if (SU) { 3986 tracePick(Only1, true); 3987 } else { 3988 CandPolicy NoPolicy; 3989 SchedCandidate BotCand(NoPolicy); 3990 // Set the bottom-up policy based on the state of the current bottom 3991 // zone and the instructions outside the zone, including the top zone. 3992 setPolicy(BotCand.Policy, /*IsPostRA=*/true, Bot, nullptr); 3993 pickNodeFromQueue(Bot, BotCand); 3994 assert(BotCand.Reason != NoCand && "failed to find a candidate"); 3995 tracePick(BotCand); 3996 SU = BotCand.SU; 3997 } 3998 IsTopNode = false; 3999 } else { 4000 4001 assert(RegionPolicy.OnlyTopDown); 4002 SU = Top.pickOnlyChoice(); 4003 if (SU) { 4004 tracePick(Only1, true); 4005 } else { 4006 CandPolicy NoPolicy; 4007 SchedCandidate TopCand(NoPolicy); 4008 // Set the top-down policy based on the state of the current top zone 4009 // and the instructions outside the zone, including the bottom zone. 4010 setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); 4011 pickNodeFromQueue(Top, TopCand); 4012 assert(TopCand.Reason != NoCand && "failed to find a candidate"); 4013 tracePick(TopCand); 4014 SU = TopCand.SU; 4015 } 4016 IsTopNode = true; 4017 } 4018 } while (SU->isScheduled); 4019 4020 if (SU->isTopReady()) 4021 Top.removeReady(SU); 4022 if (SU->isBottomReady()) 4023 Bot.removeReady(SU); 4024 4025 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " 4026 << *SU->getInstr()); 4027 return SU; 4028 } 4029 4030 /// Called after ScheduleDAGMI has scheduled an instruction and updated 4031 /// scheduled/remaining flags in the DAG nodes. 4032 void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { 4033 if (IsTopNode) { 4034 SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); 4035 Top.bumpNode(SU); 4036 } else { 4037 SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle()); 4038 Bot.bumpNode(SU); 4039 } 4040 } 4041 4042 ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { 4043 ScheduleDAGMI *DAG = 4044 new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(C), 4045 /*RemoveKillFlags=*/true); 4046 const TargetSubtargetInfo &STI = C->MF->getSubtarget(); 4047 // Add MacroFusion mutation if fusions are not empty. 4048 const auto &MacroFusions = STI.getMacroFusions(); 4049 if (!MacroFusions.empty()) 4050 DAG->addMutation(createMacroFusionDAGMutation(MacroFusions)); 4051 return DAG; 4052 } 4053 4054 //===----------------------------------------------------------------------===// 4055 // ILP Scheduler. 
Currently for experimental analysis of heuristics. 4056 //===----------------------------------------------------------------------===// 4057 4058 namespace { 4059 4060 /// Order nodes by the ILP metric. 4061 struct ILPOrder { 4062 const SchedDFSResult *DFSResult = nullptr; 4063 const BitVector *ScheduledTrees = nullptr; 4064 bool MaximizeILP; 4065 4066 ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {} 4067 4068 /// Apply a less-than relation on node priority. 4069 /// 4070 /// (Return true if A comes after B in the Q.) 4071 bool operator()(const SUnit *A, const SUnit *B) const { 4072 unsigned SchedTreeA = DFSResult->getSubtreeID(A); 4073 unsigned SchedTreeB = DFSResult->getSubtreeID(B); 4074 if (SchedTreeA != SchedTreeB) { 4075 // Unscheduled trees have lower priority. 4076 if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB)) 4077 return ScheduledTrees->test(SchedTreeB); 4078 4079 // Trees with shallower connections have lower priority. 4080 if (DFSResult->getSubtreeLevel(SchedTreeA) 4081 != DFSResult->getSubtreeLevel(SchedTreeB)) { 4082 return DFSResult->getSubtreeLevel(SchedTreeA) 4083 < DFSResult->getSubtreeLevel(SchedTreeB); 4084 } 4085 } 4086 if (MaximizeILP) 4087 return DFSResult->getILP(A) < DFSResult->getILP(B); 4088 else 4089 return DFSResult->getILP(A) > DFSResult->getILP(B); 4090 } 4091 }; 4092 4093 /// Schedule based on the ILP metric. 4094 class ILPScheduler : public MachineSchedStrategy { 4095 ScheduleDAGMILive *DAG = nullptr; 4096 ILPOrder Cmp; 4097 4098 std::vector<SUnit*> ReadyQ; 4099 4100 public: 4101 ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {} 4102 4103 void initialize(ScheduleDAGMI *dag) override { 4104 assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness"); 4105 DAG = static_cast<ScheduleDAGMILive*>(dag); 4106 DAG->computeDFSResult(); 4107 Cmp.DFSResult = DAG->getDFSResult(); 4108 Cmp.ScheduledTrees = &DAG->getScheduledTrees(); 4109 ReadyQ.clear(); 4110 } 4111 4112 void registerRoots() override { 4113 // Restore the heap in ReadyQ with the updated DFS results. 4114 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 4115 } 4116 4117 /// Implement MachineSchedStrategy interface. 4118 /// ----------------------------------------- 4119 4120 /// Callback to select the highest priority node from the ready Q. 4121 SUnit *pickNode(bool &IsTopNode) override { 4122 if (ReadyQ.empty()) return nullptr; 4123 std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 4124 SUnit *SU = ReadyQ.back(); 4125 ReadyQ.pop_back(); 4126 IsTopNode = false; 4127 LLVM_DEBUG(dbgs() << "Pick node " 4128 << "SU(" << SU->NodeNum << ") " 4129 << " ILP: " << DAG->getDFSResult()->getILP(SU) 4130 << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) 4131 << " @" 4132 << DAG->getDFSResult()->getSubtreeLevel( 4133 DAG->getDFSResult()->getSubtreeID(SU)) 4134 << '\n' 4135 << "Scheduling " << *SU->getInstr()); 4136 return SU; 4137 } 4138 4139 /// Scheduler callback to notify that a new subtree is scheduled. 4140 void scheduleTree(unsigned SubtreeID) override { 4141 std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 4142 } 4143 4144 /// Callback after a node is scheduled. Mark a newly scheduled tree, notify 4145 /// DFSResults, and resort the priority Q. 
4146 void schedNode(SUnit *SU, bool IsTopNode) override { 4147 assert(!IsTopNode && "SchedDFSResult needs bottom-up"); 4148 } 4149 4150 void releaseTopNode(SUnit *) override { /*only called for top roots*/ } 4151 4152 void releaseBottomNode(SUnit *SU) override { 4153 ReadyQ.push_back(SU); 4154 std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); 4155 } 4156 }; 4157 4158 } // end anonymous namespace 4159 4160 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { 4161 return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(true)); 4162 } 4163 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { 4164 return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(false)); 4165 } 4166 4167 static MachineSchedRegistry ILPMaxRegistry( 4168 "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler); 4169 static MachineSchedRegistry ILPMinRegistry( 4170 "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler); 4171 4172 //===----------------------------------------------------------------------===// 4173 // Machine Instruction Shuffler for Correctness Testing 4174 //===----------------------------------------------------------------------===// 4175 4176 #ifndef NDEBUG 4177 namespace { 4178 4179 /// Apply a less-than relation on the node order, which corresponds to the 4180 /// instruction order prior to scheduling. IsReverse implements greater-than. 4181 template<bool IsReverse> 4182 struct SUnitOrder { 4183 bool operator()(SUnit *A, SUnit *B) const { 4184 if (IsReverse) 4185 return A->NodeNum > B->NodeNum; 4186 else 4187 return A->NodeNum < B->NodeNum; 4188 } 4189 }; 4190 4191 /// Reorder instructions as much as possible. 4192 class InstructionShuffler : public MachineSchedStrategy { 4193 bool IsAlternating; 4194 bool IsTopDown; 4195 4196 // Using a less-than relation (SUnitOrder<false>) for the TopQ priority 4197 // gives nodes with a higher number higher priority causing the latest 4198 // instructions to be scheduled first. 4199 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>> 4200 TopQ; 4201 4202 // When scheduling bottom-up, use greater-than as the queue priority. 4203 PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>> 4204 BottomQ; 4205 4206 public: 4207 InstructionShuffler(bool alternate, bool topdown) 4208 : IsAlternating(alternate), IsTopDown(topdown) {} 4209 4210 void initialize(ScheduleDAGMI*) override { 4211 TopQ.clear(); 4212 BottomQ.clear(); 4213 } 4214 4215 /// Implement MachineSchedStrategy interface. 
4216 /// ----------------------------------------- 4217 4218 SUnit *pickNode(bool &IsTopNode) override { 4219 SUnit *SU; 4220 if (IsTopDown) { 4221 do { 4222 if (TopQ.empty()) return nullptr; 4223 SU = TopQ.top(); 4224 TopQ.pop(); 4225 } while (SU->isScheduled); 4226 IsTopNode = true; 4227 } else { 4228 do { 4229 if (BottomQ.empty()) return nullptr; 4230 SU = BottomQ.top(); 4231 BottomQ.pop(); 4232 } while (SU->isScheduled); 4233 IsTopNode = false; 4234 } 4235 if (IsAlternating) 4236 IsTopDown = !IsTopDown; 4237 return SU; 4238 } 4239 4240 void schedNode(SUnit *SU, bool IsTopNode) override {} 4241 4242 void releaseTopNode(SUnit *SU) override { 4243 TopQ.push(SU); 4244 } 4245 void releaseBottomNode(SUnit *SU) override { 4246 BottomQ.push(SU); 4247 } 4248 }; 4249 4250 } // end anonymous namespace 4251 4252 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { 4253 bool Alternate = !ForceTopDown && !ForceBottomUp; 4254 bool TopDown = !ForceBottomUp; 4255 assert((TopDown || !ForceTopDown) && 4256 "-misched-topdown incompatible with -misched-bottomup"); 4257 return new ScheduleDAGMILive( 4258 C, std::make_unique<InstructionShuffler>(Alternate, TopDown)); 4259 } 4260 4261 static MachineSchedRegistry ShufflerRegistry( 4262 "shuffle", "Shuffle machine instructions alternating directions", 4263 createInstructionShuffler); 4264 #endif // !NDEBUG 4265 4266 //===----------------------------------------------------------------------===// 4267 // GraphWriter support for ScheduleDAGMILive. 4268 //===----------------------------------------------------------------------===// 4269 4270 #ifndef NDEBUG 4271 namespace llvm { 4272 4273 template<> struct GraphTraits< 4274 ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {}; 4275 4276 template<> 4277 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { 4278 DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} 4279 4280 static std::string getGraphName(const ScheduleDAG *G) { 4281 return std::string(G->MF.getName()); 4282 } 4283 4284 static bool renderGraphFromBottomUp() { 4285 return true; 4286 } 4287 4288 static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) { 4289 if (ViewMISchedCutoff == 0) 4290 return false; 4291 return (Node->Preds.size() > ViewMISchedCutoff 4292 || Node->Succs.size() > ViewMISchedCutoff); 4293 } 4294 4295 /// If you want to override the dot attributes printed for a particular 4296 /// edge, override this method. 4297 static std::string getEdgeAttributes(const SUnit *Node, 4298 SUnitIterator EI, 4299 const ScheduleDAG *Graph) { 4300 if (EI.isArtificialDep()) 4301 return "color=cyan,style=dashed"; 4302 if (EI.isCtrlDep()) 4303 return "color=blue,style=dashed"; 4304 return ""; 4305 } 4306 4307 static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) { 4308 std::string Str; 4309 raw_string_ostream SS(Str); 4310 const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G); 4311 const SchedDFSResult *DFS = DAG->hasVRegLiveness() ? 
4312 static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr; 4313 SS << "SU:" << SU->NodeNum; 4314 if (DFS) 4315 SS << " I:" << DFS->getNumInstrs(SU); 4316 return SS.str(); 4317 } 4318 4319 static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) { 4320 return G->getGraphNodeLabel(SU); 4321 } 4322 4323 static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) { 4324 std::string Str("shape=Mrecord"); 4325 const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G); 4326 const SchedDFSResult *DFS = DAG->hasVRegLiveness() ? 4327 static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr; 4328 if (DFS) { 4329 Str += ",style=filled,fillcolor=\"#"; 4330 Str += DOT::getColorString(DFS->getSubtreeID(N)); 4331 Str += '"'; 4332 } 4333 return Str; 4334 } 4335 }; 4336 4337 } // end namespace llvm 4338 #endif // NDEBUG 4339 4340 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG 4341 /// rendered using 'dot'. 4342 void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) { 4343 #ifndef NDEBUG 4344 ViewGraph(this, Name, false, Title); 4345 #else 4346 errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on " 4347 << "systems with Graphviz or gv!\n"; 4348 #endif // NDEBUG 4349 } 4350 4351 /// Out-of-line implementation with no arguments is handy for gdb. 4352 void ScheduleDAGMI::viewGraph() { 4353 viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName()); 4354 } 4355 4356 /// Sort predicate for the intervals stored in an instance of 4357 /// ResourceSegments. Intervals are always disjoint (no intersection 4358 /// for any pairs of intervals), therefore we can sort the totality of 4359 /// the intervals by looking only at the left boundary. 4360 static bool sortIntervals(const ResourceSegments::IntervalTy &A, 4361 const ResourceSegments::IntervalTy &B) { 4362 return A.first < B.first; 4363 } 4364 4365 unsigned ResourceSegments::getFirstAvailableAt( 4366 unsigned CurrCycle, unsigned AcquireAtCycle, unsigned ReleaseAtCycle, 4367 std::function<ResourceSegments::IntervalTy(unsigned, unsigned, unsigned)> 4368 IntervalBuilder) const { 4369 assert(std::is_sorted(std::begin(_Intervals), std::end(_Intervals), 4370 sortIntervals) && 4371 "Cannot execute on an un-sorted set of intervals."); 4372 4373 // Zero resource usage is allowed by TargetSchedule.td but we do not construct 4374 // a ResourceSegment interval for that situation. 4375 if (AcquireAtCycle == ReleaseAtCycle) 4376 return CurrCycle; 4377 4378 unsigned RetCycle = CurrCycle; 4379 ResourceSegments::IntervalTy NewInterval = 4380 IntervalBuilder(RetCycle, AcquireAtCycle, ReleaseAtCycle); 4381 for (auto &Interval : _Intervals) { 4382 if (!intersects(NewInterval, Interval)) 4383 continue; 4384 4385 // Move the interval right next to the top of the one it 4386 // intersects. 
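// For example (hypothetical cycles): if NewInterval is [4, 7) and it overlaps
// an already booked Interval [2, 6), RetCycle advances by 6 - 4 = 2 cycles and
// the candidate interval is rebuilt at the new cycle before the scan continues.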
4387 assert(Interval.second > NewInterval.first && 4388 "Invalid intervals configuration."); 4389 RetCycle += (unsigned)Interval.second - (unsigned)NewInterval.first; 4390 NewInterval = IntervalBuilder(RetCycle, AcquireAtCycle, ReleaseAtCycle); 4391 } 4392 return RetCycle; 4393 } 4394 4395 void ResourceSegments::add(ResourceSegments::IntervalTy A, 4396 const unsigned CutOff) { 4397 assert(A.first <= A.second && "Cannot add negative resource usage"); 4398 assert(CutOff > 0 && "0-size interval history has no use."); 4399 // Zero resource usage is allowed by TargetSchedule.td, in the case that the 4400 // instruction needed the resource to be available but does not use it. 4401 // However, ResourceSegment represents an interval that is closed on the left 4402 // and open on the right. It is impossible to represent an empty interval when 4403 // the left is closed. Do not add it to Intervals. 4404 if (A.first == A.second) 4405 return; 4406 4407 assert(all_of(_Intervals, 4408 [&A](const ResourceSegments::IntervalTy &Interval) -> bool { 4409 return !intersects(A, Interval); 4410 }) && 4411 "A resource is being overwritten"); 4412 _Intervals.push_back(A); 4413 4414 sortAndMerge(); 4415 4416 // Do not keep the full history of the intervals, just the 4417 // latest #CutOff. 4418 while (_Intervals.size() > CutOff) 4419 _Intervals.pop_front(); 4420 } 4421 4422 bool ResourceSegments::intersects(ResourceSegments::IntervalTy A, 4423 ResourceSegments::IntervalTy B) { 4424 assert(A.first <= A.second && "Invalid interval"); 4425 assert(B.first <= B.second && "Invalid interval"); 4426 4427 // Share one boundary. 4428 if ((A.first == B.first) || (A.second == B.second)) 4429 return true; 4430 4431 // full intersersect: [ *** ) B 4432 // [***) A 4433 if ((A.first > B.first) && (A.second < B.second)) 4434 return true; 4435 4436 // right intersect: [ ***) B 4437 // [*** ) A 4438 if ((A.first > B.first) && (A.first < B.second) && (A.second > B.second)) 4439 return true; 4440 4441 // left intersect: [*** ) B 4442 // [ ***) A 4443 if ((A.first < B.first) && (B.first < A.second) && (B.second > B.first)) 4444 return true; 4445 4446 return false; 4447 } 4448 4449 void ResourceSegments::sortAndMerge() { 4450 if (_Intervals.size() <= 1) 4451 return; 4452 4453 // First sort the collection. 4454 _Intervals.sort(sortIntervals); 4455 4456 // can use next because I have at least 2 elements in the list 4457 auto next = std::next(std::begin(_Intervals)); 4458 auto E = std::end(_Intervals); 4459 for (; next != E; ++next) { 4460 if (std::prev(next)->second >= next->first) { 4461 next->first = std::prev(next)->first; 4462 _Intervals.erase(std::prev(next)); 4463 continue; 4464 } 4465 } 4466 } 4467
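// For example (hypothetical bookings), sortAndMerge() turns the list
// { [1, 3), [3, 5), [7, 8) } into { [1, 5), [7, 8) }: the first two intervals
// touch (3 >= 3), so they are folded into [1, 5), while [7, 8) is kept
// separate because 5 < 7.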