//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
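///
/// As a rough illustration (simplified, not literal pass output), a shader
/// that samples a texture and afterwards performs an image store would be
/// rewritten along the lines of
///
///   S_MOV_B64 LiveMask, EXEC          ; prolog: remember the live lanes
///   S_WQM_B64 EXEC, EXEC              ; enter WQM for the sampling code
///   IMAGE_SAMPLE ...                  ; executed in WQM
///   S_AND_B64 EXEC, EXEC, LiveMask    ; back to Exact before the side effect
///   IMAGE_STORE ...                   ; executed Exact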
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateExact = 0x2,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateExact) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "Exact";
  }

  return OS;
}

struct InstrInfo {
  char Needs = 0;
  char OutNeeds = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(Flag == StateWQM || Flag == StateExact);

  // Ignore if the instruction is already marked. The typical case is that we
  // mark an instruction WQM multiple times, but for atomics it can happen that
  // Flag is StateWQM, but Needs is already set to StateExact. In this case,
  // letting the atomic run in StateExact is correct as per the relevant specs.
  if (II.Needs)
    return;

  II.Needs = Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI as WQM.
void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
                                  std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    unsigned Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, StateWQM, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
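// The returned flags are the union of all flags that were marked, which lets
// the caller decide whether any WQM handling is needed for the function at
// all.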
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");

  for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isDS(Opcode)) {
        Flags = StateWQM;
      } else if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markUsesWQM(MI, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (TII->isDisableWQM(MI)) {
        Flags = StateExact;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !II.Needs &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  BI.Needs |= II.Needs;
  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
    BI.InNeeds |= II.Needs;
    Worklist.push_back(MBB);
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = II.Needs | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(II.Needs != (StateWQM | StateExact));

  if (II.Needs == StateWQM)
    markUsesWQM(MI, Worklist);
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
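
  // Move Idx away from the preferred end of [First, Last] while it lies
  // inside a live segment of SCC; if no SCC-free slot exists within the
  // range, the remaining segment is handled below by saving and restoring
  // SCC around the insertion point.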
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  if (!(BI.InNeeds & StateWQM))
    return;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
    return;

  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");

  unsigned SavedWQMReg = 0;
  bool WQMFromExec = isEntry;
  char State = isEntry ? StateExact : StateWQM;
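  // In the entry block, EXEC still contains the exact live mask at this
  // point, so WQM can later be recovered directly from EXEC (WQMFromExec)
  // rather than from a saved register.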

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  MachineBasicBlock::iterator First = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = 0;
    char OutNeeds = 0;

    if (First == IE)
      First = II;

    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          Needs = III->second.Needs;
          OutNeeds = III->second.OutNeeds;
        }
      }

      if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
    }

    if (Needs) {
      if (Needs != State) {
        MachineBasicBlock::iterator Before =
            prepareInsertion(MBB, First, II, Needs == StateWQM,
                             Needs == StateExact || WQMFromExec);

        if (Needs == StateExact) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
        } else {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
        }

        State = Needs;
      }

      First = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}

void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    return !LiveMaskQueries.empty();
  }

  // Store a copy of the original live mask when required
  unsigned LiveMaskReg = 0;
  {
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(AMDGPU::EXEC);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
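      // (The LiveMask copy above may still have been created to satisfy
      // SI_PS_LIVE queries, which are lowered right after entering WQM.)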
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerLiveMaskQueries(LiveMaskReg);
      // EntryMI may become invalid here
      return true;
    }
  }

  DEBUG(printInfo());

  lowerLiveMaskQueries(LiveMaskReg);

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}