//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// by GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all SGPR load instructions issued before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover, some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example, when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// It also produces other bad behaviours, like loading a constant into a VGPR
// at the beginning of the shader when it won't be needed until the end of
// the shader.
//
// The scheduling problem for SI can be split into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from
// different wavefronts if they are of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps keep register usage under
// control.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader.
// Since we do want blocks to help hide high latencies, we want
// the instructions inside a block to have a minimal set of dependencies
// on high latencies. This makes it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At this step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually, when talking about 'low latency' or 'high latency', it includes
// both the latency for the cache (or global mem) data to reach the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why, even if the wavefront count is
// high, we have to try to have as many instructions hiding high latencies
// as possible.
// The OpenCL doc gives, for example, a latency of 400 cycles for a global
// mem access, which is hidden by 10 instructions if the wavefront count
// is 10.

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64-byte
// lines.
// Constant cache is shared among 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated with the 3 other
// VALUs of the CU do texture sampling too. (Don't take these figures too
// seriously, as I'm not 100% sure of the computation.)
// Data exports should get similar latency.
// For constant loading, the cache is shared among 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Units".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.
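// Informal sketch of how the pieces below fit together (see
// SIScheduler::scheduleVariant for the actual driver logic):
//   1. SIScheduleBlockCreator::getBlocks() colors the SUs, groups them into
//      SIScheduleBlocks, and schedules the instructions inside each block
//      (a fast pass for liveness, then the real intra-block pass).
//   2. SIScheduleBlockScheduler picks the order of the blocks, trading off
//      latency hiding against register usage.
//   3. SIScheduleDAGMI::moveLowLatencies() runs as a post-pass to move low
//      latency instructions further away from their users.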

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:    return "NOCAND";
  case RegUsage:  return "REGUSAGE";
  case Latency:   return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth:     return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm::SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace llvm::SISched

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (possibly some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, SGPR usage can get quite
  // high; the arbitrary limit of 60 above thus encourages using the already
  // loaded constants (in order to release some SGPRs) before loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
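  // (If none of the criteria above discriminated, the SU that comes first
  // in the original instruction stream wins, which keeps the pick stable.)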
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns whether Reg was defined between First and Last.
static bool isDefBetween(Register Reg, SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because doing so confuses the tracking.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (RegMaskPair.RegUnit.isVirtual())
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block to be packed all together before
  // scheduling, the LiveIntervals are correct, and the RPTracker is able to
  // correctly handle 5 vs 6 and 2 vs 3.
  // (Note: This is not sufficient to prevent the RPTracker from making
  // mistakes for case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5
  // and 7. Comparing against LiveInRegs is not sufficient to differentiate
  // 4 from 5 and 7. The use of isDefBetween removes case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    Register Reg = RegMaskPair.RegUnit;
    if (Reg.isVirtual() &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128-bit registers
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionalPressure.
  InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside or outside the block,
/// depending on InOrOutBlock.
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already a predecessor.
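  // (A linear scan suffices here; block predecessor lists are expected to
  // stay small.)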
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                               SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.emplace_back(Succ, Kind);

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (Register Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (Register Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }
  return B->second;
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] =
          NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because the high latency order is
      // enforced there (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] || (CurrentColoring[k] != ProposedColor &&
                                          CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there'll be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // These cannot live in another block (because they
        // depend (order dependency) on one of the
        // instructions in the block, and are required for the
        // high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as its parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as its parents.
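    // (Colors <= DAGSize are reserved IDs for high latency groups; colors
    // above DAGSize are combination colors, since NextNonReservedID starts
    // at DAGSize + 1 in createBlocksForVariant.)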
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency.

  for (const SUnit &SU : DAG->SUnits) {
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU.NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];

    auto [Pos, Inserted] =
        ColorCombinations.try_emplace(SUColors, NextNonReservedID);
    CurrentColoring[SU.NodeNum] = Pos->second;
    if (Inserted)
      NextNonReservedID++;
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*llvm::max_element(CurrentBottomUpReservedDependencyColoring) == 0 &&
      *llvm::max_element(CurrentTopDownReservedDependencyColoring) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum :
                       DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, that the exports can be put
  // together in the same block without any other instruction.
  // This could fail to hold, for example, when scheduling after regalloc
  // if a spilled register is reloaded from memory using the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all
          // other exports that are not depended on by non-exports, directly
          // or indirectly. Simply skipping this particular export but
          // grouping all others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant may have overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG({
    dbgs() << "Blocks created:\n\n";
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

// Function adapted from CodeGen/MachineScheduler.cpp.

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first produce a valid schedule, such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position,
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most CPU intensive operation of the scheduler.
        // It would gain a lot if there were a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG({
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the number of child blocks having the same virtual register as
  // input. Here is an example: A produces x and y. B consumes x and produces
  // x'. C consumes x' and y. The register coalescer may have assigned the
  // same virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
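  // (Illustrative: in the example above, B has a higher topological index
  // than A since it depends on A, so the coalesced x/x' register is counted
  // as an output of B, and C's consumption of it is attributed to B.)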
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (SIScheduleBlock *Block : Blocks) {
    for (Register Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<Register> PredOutRegs = Pred->getOutRegs();
        std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<Register> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (Register Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<Register> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (SIScheduleBlock *Block : Blocks) {
    for (Register Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<Register> PredOutRegs = Pred->getOutRegs();
        std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:";
             for (SIScheduleBlock *Block : BlocksScheduled) {
               dbgs() << ' ' << Block->getID();
             } dbgs() << '\n';);
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled,
                       TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
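  // (The remaining tie-breaks below: prefer a high latency block, among
  // those the highest one, then the block with the most high latency
  // successors.)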
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block : ReadyBlocks)
               dbgs() << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (Register Reg : LiveRegs)
               dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
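    // (The threshold of 120 looks empirical; it presumably leaves headroom
    // below the 256 VGPRs addressable per SIMD before spilling becomes
    // likely, while still allowing several wavefronts in flight.)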
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently alive registers to determine VGPR Usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<Register> &Regs) {
  for (Register Reg : Regs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<Register> &Regs) {
  for (Register Reg : Regs) {
    // For now only track virtual registers.
    std::set<Register>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
                                              std::set<Register> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (Register Reg : InRegs) {
    // For now only track virtual registers.
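    // (If the register has more than one remaining consumer, it stays live
    // after this block reads it, so removing one consumer does not reduce
    // pressure; hence the LiveRegsConsumers check below.)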
    if (!Reg.isVirtual())
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (Register Reg : OutRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (SIScheduleBlock *Block : ScheduledBlocks) {
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from ScheduleDAG.cpp.
// Does a topological sort over the SUs.
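// (Inferred from the code rather than the original comments: the forward
// iteration order of Topo is sources-first, so TopDownIndex2SU[0] is an SU
// with no predecessors, and BottomUpIndex2SU[0] one with no successors.)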
// Both the TopDown and BottomUp orders are stored.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low latency instructions further from their users without
// increasing SGPR usage (in general).
// This is meant to be replaced by a better pass that would take
// SGPR usage into account (based on VGPR usage and the corresponding
// wavefront count), try to merge groups of loads when it makes sense, etc.
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low latency
    // instructions depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

// Restore the dependency counts (saved in SUnitsLinksBackup) so the
// SUnits can be scheduled again from scratch.
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
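// (Illustrative note, not from the original comments: PSetI.getWeight()
// scales with the register's size, so a 128-bit VGPR tuple typically adds 4
// to the VGPR_32 pressure set where a single 32-bit VGPR adds 1.)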
template<typename Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (Iterator RegI = First; RegI != End; ++RegI) {
    Register Reg = *RegI;
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  postProcessDAG();

  LLVM_DEBUG(dump());
  if (PrintDAGs)
    dump();
  if (ViewMISchedDAGs)
    viewGraph();

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      bool OffsetIsScalable;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other well-performing variants
  // that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//    { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//    { LatenciesGrouped, BlockRegUsageLatency },
//    { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//    { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//    { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // that perform worse, but could lead to lower VGPR usage.
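  // (Heuristic note, not from the original comments: 200 VGPRs is close to
  // the 256-VGPR architectural limit per lane, where the allocator would
  // likely have to spill, so even slower variants are worth trying.)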
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//    { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//    { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//    { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (unsigned I : ScheduledSUnits) {
    SUnit *SU = &SUnits[I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}
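// A minimal sketch of how this DAG is hooked into the machine scheduler
// framework (assumption: based on the registration that lives in
// AMDGPUTargetMachine.cpp, not in this file; names may differ between
// LLVM versions):
//
//   static ScheduleDAGInstrs *
//   createSIMachineScheduler(MachineSchedContext *C) {
//     return new SIScheduleDAGMI(C);
//   }
//
//   static MachineSchedRegistry
//   SISchedRegistry("si-scheduler", "Run SI's custom scheduler",
//                   createSIMachineScheduler);
//
// With such a registry entry the scheduler can be selected with
// -misched=si-scheduler (or via the corresponding subtarget feature).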