//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode (strict or non-strict)
/// for pixel shaders, and strict whole wavefront mode for all programs.
///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may take part in control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM is
/// enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires strict whole
/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
/// we use a similar save and restore mechanism and force whole quad mode for
/// those instructions:
///
///   S_MOV_B64 Tmp, EXEC
///   S_WQM_B64 EXEC, EXEC
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (i.e. which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
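///
/// (On wave32 subtargets the same sequences use the 32-bit opcodes, e.g.
/// S_WQM_B32 and S_AND_SAVEEXEC_B32; runOnMachineFunction selects the opcodes
/// based on the wavefront size.)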
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  StateStrict = StateStrictWWM | StateStrictWQM,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {

  static const std::pair<char, const char *> Mapping[] = {
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
      OS << M.second;
      State &= ~M.first;

      if (State)
        OS << '|';
    }
  }
  assert(State == 0);
  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
  char InitialState = 0;
  bool NeedsLowering = false;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

  unsigned AndOpc;
  unsigned AndTermOpc;
  unsigned AndN2Opc;
  unsigned XorOpc;
  unsigned AndSaveExecOpc;
  unsigned AndSaveExecTermOpc;
  unsigned WQMOpc;
  Register Exec;
  Register LiveMaskReg;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;
  SmallVector<MachineInstr *, 4> InitExecInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
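
  // Note: scanInstructions seeds the per-instruction flags below, after which
  // the propagate* methods run a backwards worklist dataflow until the
  // Needs/InNeeds/OutNeeds sets reach a fixed point (see analyzeFunction).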
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);

  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                            bool IsWQM);
  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
  void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
                             MachineInstr *Exit);

  void lowerBlock(MachineBasicBlock &MBB);
  void processBlock(MachineBasicBlock &MBB, bool IsEntry);

  void lowerLiveMaskQueries();
  void lowerCopyInstrs();
  void lowerKillInstrs(bool IsWQM);
  void lowerInitExec(MachineInstr &MI);
  MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);

public:
  static char ID;

  SIWholeQuadMode() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreserved<MachineDominatorTree>();
    AU.addPreserved<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III != Instructions.end()) {
        dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
      }
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
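  // Note: (value, defined-lanes) pairs are used as visit keys so that the same
  // VNInfo can be revisited when it is reached with a different set of already
  // defined lanes.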
  struct PhiEntry {
    const VNInfo *Phi;
    unsigned PredIdx;
    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (Visited.insert(Key).second) {
      // On first visit to a phi, start processing the first predecessor.
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find next predecessor to process
      unsigned Idx = NextPredIdx;
      auto PI = MBB->pred_begin() + Idx;
      auto PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process, add the phi to the stack.
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->all_defs()) {
          if (Op.getReg() != Reg)
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any lanes of the use
          HasDef |= Overlap.any();

          // Mark any lanes defined
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of use have been defined
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reached end of chain; revert to processing the last phi.
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}

void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
  assert(Op.isReg());
  Register Reg = Op.getReg();

  // Ignore some hardware registers
  switch (Reg) {
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
    return;
  default:
    break;
  }

  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
                    << " for " << MI);
  if (Reg.isVirtual()) {
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
  } else {
    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
      LiveRange &LR = LIS->getRegUnit(Unit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (Value)
        markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
    }
  }
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.all_uses())
    markOperand(MI, Use, Flag, Worklist);
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
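  // (Only blocks reachable from the entry are visited here, so unreachable
  // blocks never get a BlockInfo entry and are left untouched by the pass.)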
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        // Only generate implicit WQM if implicit derivatives are required.
        // This avoids inserting unintended WQM if a shader type without
        // implicit derivatives uses an image sampling instruction.
        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
          // Sampling instructions don't need to produce results for all
          // pixels in a quad; they just require all inputs of a quad to have
          // been computed for derivatives.
          markInstructionUses(MI, StateWQM, Worklist);
          GlobalFlags |= StateWQM;
        }
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and
        // furthermore it needs to be executed in WQM or Exact so that its
        // copy doesn't clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WQM ||
                 TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
        } else {
          // Dual source blend export acts as implicit strict-wqm; its sources
          // need to be shuffled in strict wqm, but the export itself needs to
          // run in exact mode.
          BBI.Needs |= StateExact;
          if (!(BBI.InNeeds & StateExact)) {
            BBI.InNeeds |= StateExact;
            Worklist.push_back(MBB);
          }
          GlobalFlags |= StateExact;
          III.Disabled = StateWQM | StateStrict;
        }
      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                 Opcode == AMDGPU::DS_PARAM_LOAD ||
                 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
                 Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these as StrictWQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
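        // (These loads read their parameter/attribute address implicitly via
        // M0, which is why marking the operands would otherwise pull M0's
        // definition into WQM.)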
        InstrInfo &II = Instructions[&MI];
        II.Needs |= StateStrictWQM;
        GlobalFlags |= StateStrictWQM;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
                 Opcode == AMDGPU::SI_LIVE_MASK) {
        LiveMaskQueries.push_back(&MI);
      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                 Opcode == AMDGPU::SI_DEMOTE_I1) {
        KillInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
        InitExecInstrs.push_back(&MI);
      } else if (WQMOutputs) {
        // The function is in machine SSA form, which means that physical
        // VGPRs correspond to shader inputs and outputs. Inputs are
        // only used, outputs are only defined.
        // FIXME: is this still valid?
        for (const MachineOperand &MO : MI.defs()) {
          Register Reg = MO.getReg();
          if (Reg.isPhysical() &&
              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
            Flags = StateWQM;
            break;
          }
        }
      }

      if (Flags) {
        markInstruction(MI, Flags, Worklist);
        GlobalFlags |= Flags;
      }
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
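  // (For example, a scratch store whose value is later reloaded and fed into
  // an image sample must run in WQM so that helper lanes write their data
  // too.)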
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
                                               MachineInstr *TermMI) {
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  default:
    break;
  }
  if (NewOpcode)
    TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->getBase().applyUpdates(DTUpdates);

    // Link blocks
    MachineInstr *MI =
        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(SplitBB);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  return SplitBB;
}

MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                                            MachineInstr &MI) {
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.
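  // (For example, a SETOLT "live" condition becomes V_CMP_NGT with swapped
  // sources below, so VCC is set for exactly the active lanes that fail the
  // original test, i.e. the lanes to kill.)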

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in mask;
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}

MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
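  // KillVal gives the polarity of the condition operand: lanes where the
  // operand equals KillVal are the ones to be killed.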
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask;
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote - deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill - deactivate lanes no longer in live mask
    if (Op.isImm()) {
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}

// Convert a strict mode transition to a pseudo transition.
// This still pre-allocates registers to prevent clobbering,
// but avoids any EXEC mask changes.
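// This is only done when lowerBlock detects a WQM -> StrictWQM -> WQM
// transition: within WQM, entering StrictWQM leaves EXEC unchanged, so the
// EXEC save/restore can be dropped.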
void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
                                            MachineInstr *Entry,
                                            MachineInstr *Exit) {
  assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
  assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);

  Register SaveOrig = Entry->getOperand(0).getReg();

  MachineInstr *NewEntry =
      BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
  MachineInstr *NewExit =
      BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));

  LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
  Exit->eraseFromParent();

  LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
  Entry->eraseFromParent();

  LIS->removeInterval(SaveOrig);
}

// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  char State = BI.InitialState;
  MachineInstr *StrictEntry = nullptr;

  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
    char PreviousState = State;

    if (StateTransition.count(&MI))
      State = StateTransition[&MI];

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MBB, MI);
      break;
    case AMDGPU::ENTER_STRICT_WQM:
      StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
      break;
    case AMDGPU::EXIT_STRICT_WQM:
      if (State == StateWQM && StrictEntry) {
        // Transition WQM -> StrictWQM -> WQM detected.
        lowerPseudoStrictMode(MBB, StrictEntry, &MI);
      }
      StrictEntry = nullptr;
      break;
    case AMDGPU::ENTER_STRICT_WWM:
    case AMDGPU::EXIT_STRICT_WWM:
      StrictEntry = nullptr;
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);
  }

  // Perform splitting after instruction scan to simplify iteration.
  if (!SplitPoints.empty()) {
    MachineBasicBlock *BB = &MBB;
    for (MachineInstr *MI : SplitPoints) {
      BB = splitBlock(BB, MI);
    }
  }
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->all_defs()) {
      IsExecDef |=
          MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
    }
    if (!IsExecDef)
      break;
    MBBI++;
    S = nullptr;
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  bool IsTerminator = Before == MBB.end();
  if (!IsTerminator) {
    auto FirstTerm = MBB.getFirstTerminator();
    if (FirstTerm != MBB.end()) {
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
      IsTerminator = BeforeIdx > FirstTermIdx;
    }
  }

  MachineInstr *MI;

  if (SaveWQM) {
    unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
}

void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
  MachineInstr *MI;
  assert(SaveOrig);
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
                 SaveOrig)
             .addImm(-1);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
                 SaveOrig)
             .addImm(-1);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StrictStateNeeded;

  // Mark block as needing lowering so it will be checked for unnecessary
  // transitions.
  auto BII = Blocks.find(&MBB);
  if (BII != Blocks.end())
    BII->second.NeedsLowering = true;
}

void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {
  MachineInstr *MI;

  assert(SavedOrig);
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
                 Exec)
             .addReg(SavedOrig);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
                 Exec)
             .addReg(SavedOrig);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonStrictState;
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY &&
        II->getOperand(1).getReg() == TRI->getExec())
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
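  // It is reset whenever an instruction that requires a specific state is
  // seen, so transitions are only ever hoisted over "don't care" instructions.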
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record initial state in block information.
  BI.InitialState = State;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // Exact mode exit can occur in terminators, but must be before branches.
      if (MI.isBranch() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}

void SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is that V_SET_INACTIVE has an undef
      // input, so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->removeOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
  }
}

void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineBasicBlock *MBB = MI->getParent();
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MBB, *MI);
      break;
    }
    if (SplitPoint)
      splitBlock(MBB, SplitPoint);
  }
}

void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  bool IsWave32 = ST->isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
                Exec)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in the current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, just move the insertion
        // pointer past it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
/// for instructions that depend on EXEC.
MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
  MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();

  for (MachineInstr *MI : InitExecInstrs) {
    // Try to handle undefined cases gracefully:
    // - multiple INIT_EXEC instructions
    // - INIT_EXEC instructions not in the entry block
    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());

    lowerInitExec(*MI);
  }

  return InsertPt;
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();
  MDT = getAnalysisIfAvailable<MachineDominatorTree>();
  PDT = getAnalysisIfAvailable<MachinePostDominatorTree>();

  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndTermOpc = AMDGPU::S_AND_B32_term;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndTermOpc = AMDGPU::S_AND_B64_term;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());

  LiveMaskReg = Exec;

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry);

  // Shader is simple and does not need any state changes or complex lowering.
  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
      LowerToMovInstrs.empty() && KillInstrs.empty()) {
    lowerLiveMaskQueries();
    return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
  }

  // Store a copy of the original live mask when required
  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  LLVM_DEBUG(printInfo());

  lowerLiveMaskQueries();
  lowerCopyInstrs();

  // Shader only needs WQM
  if (GlobalFlags == StateWQM) {
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
  } else {
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return true;
}