//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
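///
/// For example, a pixel shader that samples a texture and stores the result
/// might contain (as a rough sketch in pseudo machine IR, with opcodes and
/// operands simplified):
///
///   %coord = V_ADD_F32 ...        ; needs WQM: feeds derivative computation
///   %texel = IMAGE_SAMPLE %coord  ; seeds the WQM requirement on its inputs
///   BUFFER_STORE_DWORD %texel     ; must execute with the Exact live mask
///
/// so the pass keeps WQM enabled up to and including the sample, and restores
/// the Exact mask around the store.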
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

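// Per-instruction and per-block analysis results. (Note: the field summaries
// below are inferred from how the pass uses them.)
//
//   Needs    - execution states that must be active (for a block: somewhere
//              within the block).
//   Disabled - states that must not be active for this instruction.
//   InNeeds  - states that must be available on entry to the block.
//   OutNeeds - states required immediately after the instruction / block.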
struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    unsigned Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
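// For example, a sampling instruction (isWQM) seeds StateWQM on the
// instructions defining its operands, while an instruction that disables WQM
// (isDisableWQM) must itself run Exact and marks its block accordingly.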
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
        Flags = StateWQM;
      } else if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and in addition
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToCopyInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
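        // set.inactive writes its first operand to the active lanes of the
        // result and its second to the inactive lanes, so the write itself
        // must not run in WWM (Disabled below), while any instruction
        // computing the inactive-lane value must run in WWM. (Note: summary
        // inferred from the handling below and the semantics of
        // @llvm.amdgcn.set.inactive.)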
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            unsigned Reg = Inactive.getReg();
            if (TargetRegisterInfo::isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

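/// Propagate the requirements recorded for \p MI up to the block level, to the
/// OutNeeds of the preceding instruction, and to the instructions defining its
/// uses, queueing further work items whenever a flag changes.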
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

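  // Step over SCC's live segments in the preferred direction until we find a
  // point within [FirstIdx, LastIdx] where SCC is dead; if the whole range is
  // covered, fall back to saving and restoring SCC around the insertion point.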
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
               SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
               AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
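  // Walk the instructions, determining for each one the set of states in
  // which it may execute, and insert EXEC transitions at points chosen by
  // prepareInsertion within the window where a switch is known to be safe.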
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state
          // that already matches our needs, but we shouldn't need to do
          // anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}

void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

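// Rewrite the collected WQM/WWM markers and trivial SET_INACTIVEs into plain
// COPYs, dropping any explicit operands beyond the single source.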
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);
    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction()->getCallingConv();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    if (!(GlobalFlags & StateWWM))
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(AMDGPU::EXEC);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}