//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc   // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0             // This instruction is an optional
///                                    // optimization which allows us to
///                                    // branch if all the bits of
///                                    // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0  // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0  // Restore the exec mask for the Then block
/// %exec = S_XOR_B64 %sgpr0, %exec    // Update the exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0  // Do the ELSE block
/// label1:
/// %exec = S_OR_B64 %exec, %sgpr0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <cassert>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

static cl::opt<bool>
RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
    cl::init(true), cl::ReallyHidden);

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  SetVector<MachineInstr*> LoweredEndCf;
  DenseSet<Register> LoweredIf;
  SmallSet<MachineInstr *, 16> NeedsKillCleanup;

  const TargetRegisterClass *BoolRC = nullptr;
  bool InsertKillCleanups;
  unsigned AndOpc;
  unsigned OrOpc;
  unsigned XorOpc;
  unsigned MovTermOpc;
  unsigned Andn2TermOpc;
  unsigned XorTermrOpc;
  unsigned OrSaveExecOpc;
  unsigned Exec;

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);
  void emitEndCf(MachineInstr &MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

  void combineMasks(MachineInstr &MI);

  bool removeMBBifRedundant(MachineBasicBlock &MBB);

  void process(MachineInstr &MI);

  // Skip to the next instruction, ignoring debug instructions, and trivial
  // block boundaries (blocks that have one (typically fallthrough) successor,
  // and the successor has one predecessor).
  MachineBasicBlock::iterator
  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator It) const;

  /// Find the insertion point for a new conditional branch.
  MachineBasicBlock::iterator
  skipToUncondBrOrEnd(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator I) const {
    assert(I->isTerminator());

    // FIXME: What if we had multiple pre-existing conditional branches?
    MachineBasicBlock::iterator End = MBB.end();
    while (I != End && !I->isUnconditionalBranch())
      ++I;
    return I;
  }

  // Remove redundant SI_END_CF instructions.
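  // An SI_END_CF is redundant when the exec-mask restore it performs is
  // immediately followed by another lowered SI_END_CF (possibly in the sole
  // successor block) that restores a mask saved by an SI_IF.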
  void optimizeEndCf();

public:
  static char ID;

  SILowerControlFlow() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  MachineOperand &ImpDefSCC = MI.getOperand(3);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  ImpDefSCC.setIsDead(IsDead);
}

char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;

static bool hasKill(const MachineBasicBlock *Begin,
                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
  DenseSet<const MachineBasicBlock*> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
                                               Begin->succ_end());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (MBB == End || !Visited.insert(MBB).second)
      continue;
    for (auto &Term : MBB->terminators())
      if (TII->isKillTerminator(Term.getOpcode()))
        return true;

    Worklist.append(MBB->succ_begin(), MBB->succ_end());
  }

  return false;
}

static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
  Register SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

  if (U == MRI->use_instr_nodbg_end() ||
      std::next(U) != MRI->use_instr_nodbg_end() ||
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

  return true;
}

void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);
  Register SaveExecReg = MI.getOperand(0).getReg();
  MachineOperand &Cond = MI.getOperand(1);
  assert(Cond.getSubReg() == AMDGPU::NoSubRegister);

  MachineOperand &ImpDefSCC = MI.getOperand(4);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  // If there is only one use of save exec register and that use is SI_END_CF,
  // we can optimize SI_IF by returning the full saved exec mask instead of
  // just cleared bits.
  bool SimpleIf = isSimpleIf(MI, MRI);

  if (InsertKillCleanups) {
    // Check for SI_KILL_*_TERMINATOR on full path of control flow and
    // flag the associated SI_END_CF for insertion of a kill cleanup.
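    // Walk the save-exec register's use chain: an SI_ELSE re-defines the
    // mask, so keep following until the SI_END_CF that closes this region,
    // then check whether any block on the control-flow paths between the
    // SI_IF and that SI_END_CF ends in a kill terminator.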
    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
    while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
      assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
      assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
      MachineOperand &NextExec = UseMI->getOperand(0);
      Register NextExecReg = NextExec.getReg();
      if (NextExec.isDead()) {
        assert(!SimpleIf);
        break;
      }
      UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
    }
    if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
      if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
        NeedsKillCleanup.insert(&*UseMI);
        SimpleIf = false;
      }
    }
  } else if (SimpleIf) {
    // Check for SI_KILL_*_TERMINATOR on the path from if to endif.
    // If there is any such terminator, simplifications are not safe.
    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
  }

  // Add an implicit def of exec to discourage scheduling VALU after this which
  // will interfere with trying to form s_and_saveexec_b64 later.
  Register CopyReg = SimpleIf ? SaveExecReg
                              : MRI->createVirtualRegister(BoolRC);
  MachineInstr *CopyExec =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
    .addReg(Exec)
    .addReg(Exec, RegState::ImplicitDefine);
  LoweredIf.insert(CopyReg);

  Register Tmp = MRI->createVirtualRegister(BoolRC);

  MachineInstr *And =
    BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
    .addReg(CopyReg)
    .add(Cond);

  setImpSCCDefDead(*And, true);

  MachineInstr *Xor = nullptr;
  if (!SimpleIf) {
    Xor =
      BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
      .addReg(Tmp)
      .addReg(CopyReg);
    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
  }

  // Use a copy that is a terminator to get correct spill code placement with
  // fast regalloc.
  MachineInstr *SetExec =
    BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
    .addReg(Tmp, RegState::Kill);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  I = skipToUncondBrOrEnd(MBB, I);

  // Insert the S_CBRANCH_EXECZ instruction which will be optimized later
  // during SIRemoveShortExecBranches.
  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
                            .add(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->InsertMachineInstrInMaps(*CopyExec);

  // Replace with the AND so we don't need to fix the live interval for the
  // condition register.
  LIS->ReplaceMachineInstrInMaps(MI, *And);

  if (!SimpleIf)
    LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*SetExec);
  LIS->InsertMachineInstrInMaps(*NewBr);

  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
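  // For now, drop the stale interval for the save-exec register and recompute
  // the intervals for the registers defined above from scratch.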
  LIS->removeInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(Tmp);
  if (!SimpleIf)
    LIS->createAndComputeVirtRegInterval(CopyReg);
}

void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = MI.getOperand(0).getReg();

  bool ExecModified = MI.getOperand(3).getImm() != 0;
  MachineBasicBlock::iterator Start = MBB.begin();

  // This must be inserted before phis and any spill code inserted before the
  // else.
  Register SaveReg = ExecModified ?
                     MRI->createVirtualRegister(BoolRC) : DstReg;
  MachineInstr *OrSaveExec =
    BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
    .add(MI.getOperand(1)); // Saved EXEC

  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  if (ExecModified) {
    MachineInstr *And =
      BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
      .addReg(Exec)
      .addReg(SaveReg);

    if (LIS)
      LIS->InsertMachineInstrInMaps(*And);
  }

  MachineInstr *Xor =
    BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
    .addReg(Exec)
    .addReg(DstReg);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  ElsePt = skipToUncondBrOrEnd(MBB, ElsePt);

  MachineInstr *Branch =
    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*OrSaveExec);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  LIS->removeInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(DstReg);
  if (ExecModified)
    LIS->createAndComputeVirtRegInterval(SaveReg);

  // Let this be recomputed.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
}

void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  auto Dst = MI.getOperand(0).getReg();

  // Skip ANDing with exec if the break condition is already masked by exec
  // because it is a V_CMP in the same basic block. (We know the break
  // condition operand was an i1 in IR, so if it is a VALU instruction it must
  // be one with a carry-out.)
  bool SkipAnding = false;
  if (MI.getOperand(1).isReg()) {
    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
      SkipAnding = Def->getParent() == MI.getParent()
                   && SIInstrInfo::isVALU(*Def);
    }
  }

  // AND the break condition operand with exec, then OR that into the "loop
  // exit" mask.
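  // That is, lower
  //   %dst = SI_IF_BREAK %cond, %exit_mask
  // roughly into (wave64 shown; wave32 uses the 32-bit opcodes):
  //   %and = S_AND_B64 $exec, %cond     ; omitted when SkipAnding
  //   %dst = S_OR_B64 %and, %exit_mask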
  MachineInstr *And = nullptr, *Or = nullptr;
  if (!SkipAnding) {
    Register AndReg = MRI->createVirtualRegister(BoolRC);
    And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
          .addReg(Exec)
          .add(MI.getOperand(1));
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
         .addReg(AndReg)
         .add(MI.getOperand(2));
    if (LIS)
      LIS->createAndComputeVirtRegInterval(AndReg);
  } else
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
         .add(MI.getOperand(1))
         .add(MI.getOperand(2));

  if (LIS) {
    if (And)
      LIS->InsertMachineInstrInMaps(*And);
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
  }

  MI.eraseFromParent();
}

void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
    BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
    .addReg(Exec)
    .add(MI.getOperand(0));

  auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator());
  MachineInstr *Branch =
    BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .add(MI.getOperand(1));

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}

MachineBasicBlock::iterator
SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {

  SmallSet<const MachineBasicBlock *, 4> Visited;
  MachineBasicBlock *B = &MBB;
  do {
    if (!Visited.insert(B).second)
      return MBB.end();

    auto E = B->end();
    for ( ; It != E; ++It) {
      if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
        continue;
      if (TII->mayReadEXEC(*MRI, *It))
        break;
    }

    if (It != E)
      return It;

    if (B->succ_size() != 1)
      return MBB.end();

    // If there is one trivial successor, advance to the next block.
    MachineBasicBlock *Succ = *B->succ_begin();

    It = Succ->begin();
    B = Succ;
  } while (true);
}

void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  Register CFMask = MI.getOperand(0).getReg();
  MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt =
    Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def))
                                    : MBB.begin();
  MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
                            .addReg(Exec)
                            .add(MI.getOperand(0));

  LoweredEndCf.insert(NewMI);

  // If this ends control flow which contains kills (as flagged in emitIf)
  // then insert an SI_KILL_CLEANUP immediately following the exec mask
  // manipulation. This can be lowered to early termination if appropriate.
  MachineInstr *CleanUpMI = nullptr;
  if (NeedsKillCleanup.count(&MI))
    CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
    if (CleanUpMI)
      LIS->InsertMachineInstrInMaps(*CleanUpMI);
  }

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
}

// Returns replacement operands for a logical operation: either a single
// operand when the source is exec or a copy of it, or two operands when the
// source was another equivalent operation.
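// For example (register numbers are illustrative), given
//   %1 = S_AND_B64 $exec, %3
//   %2 = S_AND_B64 $exec, %1
// querying operand 2 of the second instruction yields {$exec, %3}, which
// lets combineMasks rewrite it as %2 = S_AND_B64 $exec, %3.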
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
       SmallVectorImpl<MachineOperand> &Src) const {
  MachineOperand &Op = MI.getOperand(OpNo);
  if (!Op.isReg() || !Op.getReg().isVirtual()) {
    Src.push_back(Op);
    return;
  }

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getParent() != MI.getParent() ||
      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
    return;

  // Make sure we do not modify exec between def and use.
  // A copy with implicitly defined exec inserted earlier is an exclusion, it
  // does not really modify exec.
  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
        !(I->isCopy() && I->getOperand(0).getReg() != Exec))
      return;

  for (const auto &SrcOp : Def->explicit_operands())
    if (SrcOp.isReg() && SrcOp.isUse() &&
        (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec))
      Src.push_back(SrcOp);
}

// Search and combine pairs of equivalent instructions, like
// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
// S_OR_B64  x, (S_OR_B64 x, y)  => S_OR_B64 x, y
// One of the operands is the exec mask.
void SILowerControlFlow::combineMasks(MachineInstr &MI) {
  assert(MI.getNumExplicitOperands() == 3);
  SmallVector<MachineOperand, 4> Ops;
  unsigned OpToReplace = 1;
  findMaskOperands(MI, 1, Ops);
  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
  findMaskOperands(MI, 2, Ops);
  if (Ops.size() != 3) return;

  unsigned UniqueOpndIdx;
  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else return;

  Register Reg = MI.getOperand(OpToReplace).getReg();
  MI.RemoveOperand(OpToReplace);
  MI.addOperand(Ops[UniqueOpndIdx]);
  if (MRI->use_empty(Reg))
    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}

void SILowerControlFlow::optimizeEndCf() {
  // If the only instruction immediately following this END_CF is another
  // END_CF in the only successor, we can avoid emitting the exec mask restore
  // here.
  if (!RemoveRedundantEndcf)
    return;

  for (MachineInstr *MI : LoweredEndCf) {
    MachineBasicBlock &MBB = *MI->getParent();
    auto Next =
      skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
    if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
      continue;
    // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
    // If it belongs to an SI_ELSE then the saved mask has an inverted value.
    Register SavedExec
        = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
    assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");

    const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
    if (Def && LoweredIf.count(SavedExec)) {
      LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
      if (LIS)
        LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      removeMBBifRedundant(MBB);
    }
  }
}

void SILowerControlFlow::process(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::iterator I(MI);
  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
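  // Prev records the instruction just before MI so that, after the switch
  // below, the newly emitted instructions can be re-scanned and redundant
  // exec-mask operations folded with combineMasks().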

  switch (MI.getOpcode()) {
  case AMDGPU::SI_IF:
    emitIf(MI);
    break;

  case AMDGPU::SI_ELSE:
    emitElse(MI);
    break;

  case AMDGPU::SI_IF_BREAK:
    emitIfBreak(MI);
    break;

  case AMDGPU::SI_LOOP:
    emitLoop(MI);
    break;

  case AMDGPU::SI_END_CF:
    emitEndCf(MI);
    break;

  default:
    assert(false && "Attempt to process unsupported instruction");
    break;
  }

  MachineBasicBlock::iterator Next;
  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
    Next = std::next(I);
    MachineInstr &MaskMI = *I;
    switch (MaskMI.getOpcode()) {
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32:
      // Clean up bit manipulations on the exec mask.
      combineMasks(MaskMI);
      break;
    default:
      I = MBB.end();
      break;
    }
  }
}

bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
  bool Redundant = true;
  for (auto &I : MBB.instrs()) {
    if (!I.isDebugInstr() && !I.isUnconditionalBranch())
      Redundant = false;
  }
  if (Redundant) {
    MachineBasicBlock *Succ = *MBB.succ_begin();
    SmallVector<MachineBasicBlock *, 2> Preds(MBB.predecessors());
    for (auto P : Preds) {
      P->replaceSuccessor(&MBB, Succ);
      MachineBasicBlock::iterator I(P->getFirstInstrTerminator());
      while (I != P->end()) {
        if (I->isBranch()) {
          if (TII->getBranchDestBlock(*I) == &MBB) {
            I->getOperand(0).setMBB(Succ);
            break;
          }
        }
        I++;
      }
      if (I == P->end()) {
        MachineFunction *MF = P->getParent();
        MachineFunction::iterator InsertPt =
            P->getNextNode() ? MachineFunction::iterator(P->getNextNode())
                             : MF->end();
        MF->splice(InsertPt, Succ);
      }
    }
    MBB.removeSuccessor(Succ);
    if (LIS) {
      for (auto &I : MBB.instrs())
        LIS->RemoveMachineInstrFromMaps(I);
    }
    MBB.clear();
    MBB.eraseFromParent();
    return true;
  }
  return false;
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // This doesn't actually need LiveIntervals, but we can preserve them.
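  // When they are available, every instruction this pass inserts or erases is
  // registered with them so the analysis remains valid.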
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  MRI = &MF.getRegInfo();
  BoolRC = TRI->getBoolRC();
  InsertKillCleanups =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  if (ST.isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    OrOpc = AMDGPU::S_OR_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    MovTermOpc = AMDGPU::S_MOV_B32_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
    XorTermrOpc = AMDGPU::S_XOR_B32_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    OrOpc = AMDGPU::S_OR_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    MovTermOpc = AMDGPU::S_MOV_B64_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
    XorTermrOpc = AMDGPU::S_XOR_B64_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    Exec = AMDGPU::EXEC;
  }

  SmallVector<MachineInstr *, 32> Worklist;

  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
        process(MI);
        break;

      case AMDGPU::SI_ELSE:
      case AMDGPU::SI_IF_BREAK:
      case AMDGPU::SI_LOOP:
      case AMDGPU::SI_END_CF:
        // Only build worklist if SI_IF instructions must be processed first.
        if (InsertKillCleanups)
          Worklist.push_back(&MI);
        else
          process(MI);
        break;

      default:
        break;
      }
    }
  }

  for (MachineInstr *MI : Worklist)
    process(*MI);

  optimizeEndCf();

  LoweredEndCf.clear();
  LoweredIf.clear();
  NeedsKillCleanup.clear();

  return true;
}