//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "SILowerI1Copies.h"
#include "AMDGPU.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);

namespace {

class Vreg1LoweringHelper : public PhiLoweringHelper {
public:
  Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT);

private:
  DenseSet<Register> ConstrainRegs;

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerCopiesFromI1();
  bool lowerCopiesToI1();
  bool cleanConstrainRegs(bool Changed);
  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }
};

Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
                                         MachineDominatorTree *DT,
                                         MachinePostDominatorTree *PDT)
    : PhiLoweringHelper(MF, DT, PDT) {}

bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
  assert(Changed || ConstrainRegs.empty());
  for (Register Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return Changed;
}

/// Helper class that analyzes the relationship between the incoming values of
/// a phi in the control flow graph, to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;
  const SIInstrInfo *TII;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
      : PDT(PDT), TII(TII) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (auto Incoming : Incomings) {
      MachineBasicBlock *MBB = Incoming.Block;
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      append_range(Stack, MBB->successors());
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header.
/// Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
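  /// The undef is inserted either in the nearest common dominator of the loop
  /// blocks and \p Incomings, or, if that dominator is itself part of the loop
  /// (or one of \p Incomings), in each of its predecessors that is not.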
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      MachineRegisterInfo &MRI,
                      MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
                      ArrayRef<Incoming> Incomings = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (auto &Incoming : Incomings)
      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);

    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
      SSAUpdater.AddAvailableValue(
          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
          SSAUpdater.AddAvailableValue(
              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<Incoming> Incomings) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    for (auto &Incoming : Incomings)
      if (Incoming.Block == &MBB)
        return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

Register
llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
                        MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  return MRI->createVirtualRegister(LaneMaskRegAttrs);
}

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

bool Vreg1LoweringHelper::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
                                     MachineDominatorTree *DT,
                                     MachinePostDominatorTree *PDT)
    : MF(MF), DT(DT), PDT(PDT) {
  MRI = &MF->getRegInfo();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }
}

bool PhiLoweringHelper::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT, TII);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<Incoming, 4> Incomings;

  getCandidatesForLowering(Vreg1Phis);
  if (Vreg1Phis.empty())
    return false;

  DT->updateDFSNumbers();
  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    markAsLaneMask(DstReg);
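    // Record the attributes of the lane-mask register so that the temporaries
    // created below via createLaneMaskReg() get matching attributes.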
    initializeLaneMaskRegisterAttributes(DstReg);

    collectIncomingValuesFromPhi(MI, Incomings);

    // Sort the incomings such that incoming values that dominate other
    // incoming values are sorted earlier. This allows us to do some amount of
    // on-the-fly constant folding.
    // Incomings with a smaller DFSNumIn go first; DFSNumIn is 0 for the entry
    // block.
    llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
      return DT->getNode(LHS.Block)->getDFSNumIn() <
             DT->getNode(RHS.Block)->getDFSNumIn();
    });

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
                        Incomings);

      for (auto &Incoming : Incomings) {
        Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
      }

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
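      // Incoming values from blocks that are sources in the induced CFG
      // subgraph can be used as lane masks as-is; all other incoming values
      // are merged into the running mask under EXEC.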
      PIA.analyze(MBB, Incomings);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(
            MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs));

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        if (PIA.isSource(IMBB)) {
          constrainAsLaneMask(Incoming);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
        } else {
          Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
        }
      }

      for (auto &Incoming : Incomings) {
        if (!Incoming.UpdatedReg.isValid())
          continue;

        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      replaceDstReg(NewReg, DstReg, &MBB);
      MI->eraseFromParent();
    }

    Incomings.clear();
  }
  return true;
}

bool Vreg1LoweringHelper::lowerCopiesToI1() {
  bool Changed = false;
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      Changed = true;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      markAsLaneMask(DstReg);
      initializeLaneMaskRegisterAttributes(DstReg);

      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      } else {
        // SrcReg needs to be live beyond copy.
        MI.getOperand(1).setIsKill(false);
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
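      // (The lane mask has to accumulate results across iterations, so the
      // current value is merged into the previously accumulated mask under
      // EXEC.)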
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
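/// If a terminator reads SCC, the returned iterator points at the closest
/// preceding SCC definition, so that SALU instructions inserted before it
/// cannot clobber the value the terminator consumes.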
MachineBasicBlock::iterator
PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void Vreg1LoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    assert(i + 1 < MI->getNumOperands());
    Register IncomingReg = MI->getOperand(i).getReg();
    MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
    MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

    if (IncomingDef->getOpcode() == AMDGPU::COPY) {
      IncomingReg = IncomingDef->getOperand(1).getReg();
      assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
      assert(!IncomingDef->getOperand(1).getSubReg());
    } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
      continue;
    } else {
      assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
    }

    Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
  }
}

void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) {
  MRI->replaceRegWith(NewReg, OldReg);
}

// Merge the previous and current lane masks under EXEC; in the general case
// this materializes
//   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
// so that active lanes take the current value and inactive lanes keep the
// previously accumulated one. Constant inputs (0 / -1 / undef) are folded
// into simpler sequences.
void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator I,
                                              const DebugLoc &DL,
                                              Register DstReg, Register PrevReg,
                                              Register CurReg) {
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  Register PrevMaskedReg;
  Register CurMaskedReg;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}

void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT,
                           MachinePostDominatorTree &MPDT) {
  // Only need to run this for the SelectionDAG path.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  Vreg1LoweringHelper Helper(&MF, &MDT, &MPDT);
  bool Changed = false;
  Changed |= Helper.lowerCopiesFromI1();
  Changed |= Helper.lowerPhis();
  Changed |= Helper.lowerCopiesToI1();
  return Helper.cleanConstrainRegs(Changed);
}

PreservedAnalyses
SILowerI1CopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree &MPDT =
      MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
  bool Changed = runFixI1Copies(MF, MDT, MPDT);
  if (!Changed)
    return PreservedAnalyses::all();

  // TODO: Probably preserves most.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

class SILowerI1CopiesLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

bool SILowerI1CopiesLegacy::runOnMachineFunction(MachineFunction &MF) {
  MachineDominatorTree &MDT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &MPDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  return runFixI1Copies(MF, MDT, MPDT);
}

INITIALIZE_PASS_BEGIN(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                    false, false)

char SILowerI1CopiesLegacy::ID = 0;

char &llvm::SILowerI1CopiesLegacyID = SILowerI1CopiesLegacy::ID;

FunctionPass *llvm::createSILowerI1CopiesLegacyPass() {
  return new SILowerI1CopiesLegacy();
}