//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vsrc> = COPY %0 <sgpr>
///    ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %3 to
/// <sgpr>, so we end up with final code like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains the definition class to
/// <vgpr> if the user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr>, then the coalescer
/// will be unable to perform the COPY removal from the above example, which
/// ultimately led to the creation of an illegal COPY.
//===----------------------------------------------------------------------===//

#include "SIFixSGPRCopies.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-fix-sgpr-copies"

static cl::opt<bool> EnableM0Merge(
  "amdgpu-enable-merge-m0",
  cl::desc("Merge and hoist M0 initializations"),
  cl::init(true));

namespace {

class V2SCopyInfo {
public:
  // VGPR to SGPR copy being processed
  MachineInstr *Copy;
  // All SALU instructions reachable from this copy in the SSA graph
  SetVector<MachineInstr *> SChain;
  // Number of SGPR to VGPR copies that are used to put the SALU computation
  // results back to VALU.
  unsigned NumSVCopies = 0;

  unsigned Score = 0;
  // Actual count of v_readfirstlane_b32 instructions
  // which need to be inserted to keep SChain SALU
  unsigned NumReadfirstlanes = 0;
  // Current score state, used to speed up the selection of V2SCopyInfos for
  // processing.
  bool NeedToBeConvertedToVALU = false;
  // Unique ID. Used as a key for mapping to keep permanent order.
  unsigned ID;

  // Count of other VGPR to SGPR copies that contribute to the
  // current copy's SChain.
  unsigned SiblingPenalty = 0;
  SetVector<unsigned> Siblings;
  V2SCopyInfo() : Copy(nullptr), ID(0){};
  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
      : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() {
    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
           << "\nScore: " << Score << "\n";
  }
#endif
};

class SIFixSGPRCopies {
  MachineDominatorTree *MDT;
  SmallVector<MachineInstr*, 4> SCCCopies;
  SmallVector<MachineInstr*, 4> RegSequences;
  SmallVector<MachineInstr*, 4> PHINodes;
  SmallVector<MachineInstr*, 4> S2VCopies;
  unsigned NextVGPRToSGPRCopyID = 0;
  MapVector<unsigned, V2SCopyInfo> V2SCopies;
  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;

public:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  SIFixSGPRCopies(MachineDominatorTree *MDT) : MDT(MDT) {}

  bool run(MachineFunction &MF);
  void fixSCCCopies(MachineFunction &MF);
  void prepareRegSequenceAndPHIs(MachineFunction &MF);
  unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
  bool needToBeConvertedToVALU(V2SCopyInfo *I);
  void analyzeVGPRToSGPRCopy(MachineInstr *MI);
  void lowerVGPR2SGPRCopies(MachineFunction &MF);
  // Handles copies whose source register is:
  // 1. A physical register
  // 2. An AGPR
  // 3. Defined by an instruction that merely moves an immediate
  bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I);

  void processPHINode(MachineInstr &MI);

  // Check if MO is an immediate materialized into a VGPR, and if so replace it
  // with an SGPR immediate. The VGPR immediate is also deleted if it does not
  // have any other uses.
  bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst,
                              MachineBasicBlock *BlockToInsertTo,
                              MachineBasicBlock::iterator PointToInsertTo);
};

class SIFixSGPRCopiesLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIFixSGPRCopiesLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    MachineDominatorTree *MDT =
        &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
    SIFixSGPRCopies Impl(MDT);
    return Impl.run(MF);
  }

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies",
                    false, false)

char SIFixSGPRCopiesLegacy::ID = 0;

char &llvm::SIFixSGPRCopiesLegacyID = SIFixSGPRCopiesLegacy::ID;

FunctionPass *llvm::createSIFixSGPRCopiesLegacyPass() {
  return new SIFixSGPRCopiesLegacy();
}

static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
                  const SIRegisterInfo &TRI,
                  const MachineRegisterInfo &MRI) {
  Register DstReg = Copy.getOperand(0).getReg();
  Register SrcReg = Copy.getOperand(1).getReg();

  const TargetRegisterClass *SrcRC = SrcReg.isVirtual()
                                         ? MRI.getRegClass(SrcReg)
                                         : TRI.getPhysRegBaseClass(SrcReg);

  // We don't really care about the subregister here.
  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());

  const TargetRegisterClass *DstRC = DstReg.isVirtual()
                                         ? MRI.getRegClass(DstReg)
                                         : TRI.getPhysRegBaseClass(DstReg);

  return std::pair(SrcRC, DstRC);
}

static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
         TRI.hasVectorRegisters(SrcRC);
}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
         TRI.hasVectorRegisters(DstRC);
}

static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
                                      const SIRegisterInfo *TRI,
                                      const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  auto &Src = MI.getOperand(1);
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = Src.getReg();
  if (!SrcReg.isVirtual() || !DstReg.isVirtual())
    return false;

  for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
    const auto *UseMI = MO.getParent();
    if (UseMI == &MI)
      continue;
    if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
        UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
      return false;

    unsigned OpIdx = MO.getOperandNo();
    if (OpIdx >= UseMI->getDesc().getNumOperands() ||
        !TII->isOperandLegal(*UseMI, OpIdx, &Src))
      return false;
  }
  // Change VGPR to SGPR destination.
  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
  return true;
}

// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
//  SGPRx = ...
//  SGPRy = REG_SEQUENCE SGPRx, sub0 ...
//  VGPRz = COPY SGPRy
//
// ==>
//
//  VGPRx = COPY SGPRx
//  VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  Register DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (CopyUse.getOperand(0).getReg().isPhysical())
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy

  // =>
  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->isAGPRClass(DstRC);

  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    const TargetRegisterClass *SrcRC =
        TRI->getRegClassForOperandReg(MRI, MI.getOperand(I));
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
              TmpAReg)
          .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}

static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (Copy->getOpcode() != AMDGPU::COPY)
    return false;

  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(1).getSubReg())
    return false;

  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}

template <class UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
                        const MachineBasicBlock *CutOff,
                        UnaryPredicate Predicate) {
  if (MBB == CutOff)
    return false;

  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (!Visited.insert(MBB).second)
      continue;
    if (MBB == CutOff)
      continue;
    if (Predicate(MBB))
      return true;

    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }

  return false;
}

// Checks if there is a potential path from the From instruction to the To
// instruction. If CutOff is specified and lies on that path, we ignore the
// portion of the path above it and report To as not reachable.
static bool isReachable(const MachineInstr *From,
                        const MachineInstr *To,
                        const MachineBasicBlock *CutOff,
                        MachineDominatorTree &MDT) {
  if (MDT.dominates(From, To))
    return true;

  const MachineBasicBlock *MBBFrom = From->getParent();
  const MachineBasicBlock *MBBTo = To->getParent();

  // Do predecessor search.
  // We should almost never get here since we do not usually produce M0 stores
  // other than -1.
  return searchPredecessors(MBBTo, CutOff, [MBBFrom]
           (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}

// Return the first non-prologue instruction in the block.
static MachineBasicBlock::iterator
getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
    ++I;

  return I;
}

// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      }
      if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both can clobber, so this is not an interference only if both
            // are dominated by Clobber and belong to the same block, or if
            // Clobber properly dominates To. Since To >> From, Clobber then
            // dominates both and is located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << "and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto *MI : Defs) {
      auto *MBB = MI->getParent();
      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not, set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}

bool SIFixSGPRCopies::run(MachineFunction &MF) {
  // Only need to run this in the SelectionDAG path.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY:
      case AMDGPU::WQM:
      case AMDGPU::STRICT_WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::STRICT_WWM: {
        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          // Since VGPR to SGPR copies affect the VGPR to SGPR copy
          // score and, hence, the lowering decision, let's try to get rid of
          // them as early as possible.
          if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII))
            continue;

          // Collect those not changed to try them after VGPR to SGPR copies
          // lowering as there will be more opportunities.
          S2VCopies.push_back(&MI);
        }
        if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
          continue;
        if (lowerSpecialCase(MI, I))
          continue;

        analyzeVGPRToSGPRCopy(&MI);

        break;
      }
      case AMDGPU::INSERT_SUBREG:
      case AMDGPU::PHI:
      case AMDGPU::REG_SEQUENCE: {
        if (TRI->isSGPRClass(TII->getOpRegClass(MI, 0))) {
          for (MachineOperand &MO : MI.operands()) {
            if (!MO.isReg() || !MO.getReg().isVirtual())
              continue;
            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
            if (TRI->hasVectorRegisters(SrcRC)) {
              const TargetRegisterClass *DestRC =
                  TRI->getEquivalentSGPRClass(SrcRC);
              Register NewDst = MRI->createVirtualRegister(DestRC);
              MachineBasicBlock *BlockToInsertCopy =
                  MI.isPHI() ? MI.getOperand(MO.getOperandNo() + 1).getMBB()
                             : &MBB;
              MachineBasicBlock::iterator PointToInsertCopy =
                  MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I;

              if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertCopy,
                                          PointToInsertCopy)) {
                MachineInstr *NewCopy =
                    BuildMI(*BlockToInsertCopy, PointToInsertCopy,
                            PointToInsertCopy->getDebugLoc(),
                            TII->get(AMDGPU::COPY), NewDst)
                        .addReg(MO.getReg());
                MO.setReg(NewDst);
                analyzeVGPRToSGPRCopy(NewCopy);
              }
            }
          }
        }

        if (MI.isPHI())
          PHINodes.push_back(&MI);
        else if (MI.isRegSequence())
          RegSequences.push_back(&MI);

        break;
      }
      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // an SGPR restriction.
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case it
        // is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs.
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule.
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check for trivially easy constant prop into one of the operands.
          // If this is the case then perform the operation now to resolve the
          // SGPR issue. If we don't do that here we will always insert a mov
          // to m0 that can't be resolved in a later operand-folding pass.
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (MO->getReg().isVirtual()) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && TII->isFoldableCopy(*DefMI)) {
                const MachineOperand &Def = DefMI->getOperand(0);
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  const MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // Haven't managed to resolve by replacing an SGPR with an
            // immediate. Move src1 to be in M0.
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  lowerVGPR2SGPRCopies(MF);
  // Postprocessing
  fixSCCCopies(MF);
  for (auto *MI : S2VCopies) {
    // Check if it is still valid
    if (MI->isCopy()) {
      const TargetRegisterClass *SrcRC, *DstRC;
      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
      if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
        tryChangeVGPRtoSGPRinCopy(*MI, TRI, TII);
    }
  }
  for (auto *MI : RegSequences) {
    // Check if it is still valid
    if (MI->isRegSequence())
      foldVGPRCopyIntoRegSequence(*MI, TRI, TII, *MRI);
  }
  for (auto *MI : PHINodes) {
    processPHINode(*MI);
  }
  if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  SiblingPenalty.clear();
  V2SCopies.clear();
  SCCCopies.clear();
  RegSequences.clear();
  PHINodes.clear();
  S2VCopies.clear();

  return true;
}

void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
  bool AllAGPRUses = true;
  SetVector<const MachineInstr *> worklist;
  SmallSet<const MachineInstr *, 4> Visited;
  SetVector<MachineInstr *> PHIOperands;
  worklist.insert(&MI);
  Visited.insert(&MI);
  // HACK to make MIR tests with no uses happy
  bool HasUses = false;
  while (!worklist.empty()) {
    const MachineInstr *Instr = worklist.pop_back_val();
    Register Reg = Instr->getOperand(0).getReg();
    for (const auto &Use : MRI->use_operands(Reg)) {
      HasUses = true;
      const MachineInstr *UseMI = Use.getParent();
      AllAGPRUses &= (UseMI->isCopy() &&
                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                     TRI->isAGPR(*MRI, Use.getReg());
      if (UseMI->isCopy() || UseMI->isRegSequence()) {
        if (Visited.insert(UseMI).second)
          worklist.insert(UseMI);

        continue;
      }
    }
  }

  Register PHIRes = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC0)) {
    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
    for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
      MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
      if (DefMI && DefMI->isPHI())
        PHIOperands.insert(DefMI);
    }
  }

  if (TRI->isVectorRegister(*MRI, PHIRes) ||
      RC0 == &AMDGPU::VReg_1RegClass) {
    LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
    TII->legalizeOperands(MI, MDT);
  }

  // Propagate register class back to PHI operands which are PHI themselves.
  while (!PHIOperands.empty()) {
    processPHINode(*PHIOperands.pop_back_val());
  }
}

bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR(
    MachineOperand &MaybeVGPRConstMO, Register DstReg,
    MachineBasicBlock *BlockToInsertTo,
    MachineBasicBlock::iterator PointToInsertTo) {

  MachineInstr *DefMI = MRI->getVRegDef(MaybeVGPRConstMO.getReg());
  if (!DefMI || !DefMI->isMoveImmediate())
    return false;

  MachineOperand *SrcConst = TII->getNamedOperand(*DefMI, AMDGPU::OpName::src0);
  if (SrcConst->isReg())
    return false;

  const TargetRegisterClass *SrcRC =
      MRI->getRegClass(MaybeVGPRConstMO.getReg());
  unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC);
  unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
  BuildMI(*BlockToInsertTo, PointToInsertTo, PointToInsertTo->getDebugLoc(),
          TII->get(MoveOp), DstReg)
      .add(*SrcConst);
  if (MRI->hasOneUse(MaybeVGPRConstMO.getReg()))
    DefMI->eraseFromParent();
  MaybeVGPRConstMO.setReg(DstReg);
  return true;
}

bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
                                       MachineBasicBlock::iterator &I) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  if (!DstReg.isVirtual()) {
    // If the destination register is a physical register there isn't
    // really much we can do to fix this.
    // Some special instructions use M0 as an input. Some even only use
    // the first lane. Insert a readfirstlane and hope for the best.
    if (DstReg == AMDGPU::M0 &&
        TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) {
      Register TmpReg =
          MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
          .add(MI.getOperand(1));
      MI.getOperand(1).setReg(TmpReg);
    } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
                                      MI)) {
      I = std::next(I);
      MI.eraseFromParent();
    }
    return true;
  }
  if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
    SIInstrWorklist worklist;
    worklist.insert(&MI);
    TII->moveToVALU(worklist, MDT);
    return true;
  }

  unsigned SMovOp;
  int64_t Imm;
  // If we are just copying an immediate, we can replace the copy with
  // s_mov_b32.
  if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
    MI.getOperand(1).ChangeToImmediate(Imm);
    MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
    MI.setDesc(TII->get(SMovOp));
    return true;
  }
  return false;
}

void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
  Register DstReg = MI->getOperand(0).getReg();
  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);

  V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
                   TRI->getRegSizeInBits(*DstRC));
  SmallVector<MachineInstr *, 8> AnalysisWorklist;
  // Needed because the SSA is not a tree but a graph and may have
  // forks and joins. We should not then go the same way twice.
  DenseSet<MachineInstr *> Visited;
  AnalysisWorklist.push_back(Info.Copy);
  while (!AnalysisWorklist.empty()) {

    MachineInstr *Inst = AnalysisWorklist.pop_back_val();

    if (!Visited.insert(Inst).second)
      continue;

    // Copies and REG_SEQUENCE do not contribute to the final assembly,
    // so skip them but take care of the SGPR to VGPR copy bookkeeping.
    if (Inst->isCopy() || Inst->isRegSequence()) {
      if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
        if (!Inst->isCopy() ||
            !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
          Info.NumSVCopies++;
          continue;
        }
      }
    }

    SiblingPenalty[Inst].insert(Info.ID);

    SmallVector<MachineInstr *, 4> Users;
    if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
        (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
      auto I = Inst->getIterator();
      auto E = Inst->getParent()->end();
      while (++I != E &&
             !I->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) {
        if (I->readsRegister(AMDGPU::SCC, /*TRI=*/nullptr))
          Users.push_back(&*I);
      }
    } else if (Inst->getNumExplicitDefs() != 0) {
      Register Reg = Inst->getOperand(0).getReg();
      if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst))
        for (auto &U : MRI->use_instructions(Reg))
          Users.push_back(&U);
    }
    for (auto *U : Users) {
      if (TII->isSALU(*U))
        Info.SChain.insert(U);
      AnalysisWorklist.push_back(U);
    }
  }
  V2SCopies[Info.ID] = Info;
}

// The main function that computes the VGPR to SGPR copy score and determines
// how the copy should be lowered: v_readfirstlane_b32 or moveToVALU.
bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
  if (Info->SChain.empty()) {
    Info->Score = 0;
    return true;
  }
  Info->Siblings = SiblingPenalty[*llvm::max_element(
      Info->SChain, [&](MachineInstr *A, MachineInstr *B) -> bool {
        return SiblingPenalty[A].size() < SiblingPenalty[B].size();
      })];
  Info->Siblings.remove_if([&](unsigned ID) { return ID == Info->ID; });
  // The loop below computes the number of other VGPR to SGPR copies which
  // contribute to the current copy's SALU chain. We assume that all the
  // V2SCopies with the same source virtual register will be squashed to one
  // by regalloc. Also we take care of the V2SCopies of the different subregs
  // of the same register.
  SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
  for (auto J : Info->Siblings) {
    auto *InfoIt = V2SCopies.find(J);
    if (InfoIt != V2SCopies.end()) {
      MachineInstr *SiblingCopy = InfoIt->second.Copy;
      if (SiblingCopy->isImplicitDef())
        // the COPY has already been MoveToVALUed
        continue;

      SrcRegs.insert(std::pair(SiblingCopy->getOperand(1).getReg(),
                               SiblingCopy->getOperand(1).getSubReg()));
    }
  }
  Info->SiblingPenalty = SrcRegs.size();

  unsigned Penalty =
      Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes;
  unsigned Profit = Info->SChain.size();
  Info->Score = Penalty > Profit ? 0 : Profit - Penalty;
  Info->NeedToBeConvertedToVALU = Info->Score < 3;
  return Info->NeedToBeConvertedToVALU;
}

void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {

  SmallVector<unsigned, 8> LoweringWorklist;
  for (auto &C : V2SCopies) {
    if (needToBeConvertedToVALU(&C.second))
      LoweringWorklist.push_back(C.second.ID);
  }

  // Store all the V2S copy instructions that need to be moved to VALU
  // in the Copies worklist.
  SIInstrWorklist Copies;

  while (!LoweringWorklist.empty()) {
    unsigned CurID = LoweringWorklist.pop_back_val();
    auto *CurInfoIt = V2SCopies.find(CurID);
    if (CurInfoIt != V2SCopies.end()) {
      V2SCopyInfo C = CurInfoIt->second;
      LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
      for (auto S : C.Siblings) {
        auto *SibInfoIt = V2SCopies.find(S);
        if (SibInfoIt != V2SCopies.end()) {
          V2SCopyInfo &SI = SibInfoIt->second;
          LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
          if (!SI.NeedToBeConvertedToVALU) {
            SI.SChain.set_subtract(C.SChain);
            if (needToBeConvertedToVALU(&SI))
              LoweringWorklist.push_back(SI.ID);
          }
          SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
        }
      }
      LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                        << " is being turned to VALU\n");
      // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if
      // instead.
      V2SCopies.erase(C.ID);
      Copies.insert(C.Copy);
    }
  }

  TII->moveToVALU(Copies, MDT);
  Copies.clear();

  // Now do the actual lowering.
  for (auto C : V2SCopies) {
    MachineInstr *MI = C.second.Copy;
    MachineBasicBlock *MBB = MI->getParent();
    // We have decided to turn the V2S copy into v_readfirstlane_b32;
    // remove it from V2SCopies and from all its siblings.
    LLVM_DEBUG(dbgs() << "V2S copy " << *MI
                      << " is being turned to v_readfirstlane_b32"
                      << " Score: " << C.second.Score << "\n");
    Register DstReg = MI->getOperand(0).getReg();
    Register SrcReg = MI->getOperand(1).getReg();
    unsigned SubReg = MI->getOperand(1).getSubReg();
    const TargetRegisterClass *SrcRC =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
    size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
    if (SrcSize == 16) {
      // HACK to handle a possible 16-bit VGPR source
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
      MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
    } else if (SrcSize == 32) {
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
      MIB.addReg(SrcReg, 0, SubReg);
    } else {
      auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
                            TII->get(AMDGPU::REG_SEQUENCE), DstReg);
      int N = TRI->getRegSizeInBits(*SrcRC) / 32;
      for (int i = 0; i < N; i++) {
        Register PartialSrc = TII->buildExtractSubReg(
            Result, *MRI, MI->getOperand(1), SrcRC,
            TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
        Register PartialDst =
            MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*MBB, *Result, Result->getDebugLoc(),
                TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
            .addReg(PartialSrc);
        Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
      }
    }
    MI->eraseFromParent();
  }
}

void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
  bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32();
  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      MachineInstr &MI = *I;
      // May already have been lowered.
      if (!MI.isCopy())
        continue;
      Register SrcReg = MI.getOperand(1).getReg();
      Register DstReg = MI.getOperand(0).getReg();
      if (SrcReg == AMDGPU::SCC) {
        Register SCCCopy =
            MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
        I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
                    MI.getDebugLoc(),
                    TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
                                      : AMDGPU::S_CSELECT_B64),
                    SCCCopy)
                .addImm(-1)
                .addImm(0);
        I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
                    TII->get(AMDGPU::COPY), DstReg)
                .addReg(SCCCopy);
        MI.eraseFromParent();
        continue;
      }
      if (DstReg == AMDGPU::SCC) {
        unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
        Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
        Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
        I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
                    MI.getDebugLoc(), TII->get(Opcode))
                .addReg(Tmp, getDefRegState(true))
                .addReg(SrcReg)
                .addReg(Exec);
        MI.eraseFromParent();
      }
    }
  }
}

PreservedAnalyses
SIFixSGPRCopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  SIFixSGPRCopies Impl(&MDT);
  bool Changed = Impl.run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  // TODO: We could detect whether the CFG changed.
  auto PA = getMachineFunctionPassPreservedAnalyses();
  return PA;
}