//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching, this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z
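  //
  // A concrete wave64 instance of the first pattern (hand-written,
  // illustrative MIR, not the output of a real compilation):
  //   $sgpr0_sgpr1 = S_MOV_B64 -1
  //   $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1
  //   S_CBRANCH_VCCNZ %bb.2
  // =>
  //   S_CBRANCH_EXECNZ %bb.2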

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and the SReg definition is a VALU comparison, the S_AND
    // with EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First, if sreg is only used in the AND instruction, fold the immediate
    // into the AND.
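    // For instance (illustrative wave64 MIR, assuming sreg is killed by the
    // AND and has no other readers):
    //   $sgpr0_sgpr1 = S_MOV_B64 -1
    //   $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1
    // folds to:
    //   $vcc = S_AND_B64 $exec, -1
    // and the S_MOV_B64 is erased.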
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert the mask for s_andn2.
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace the AND with a MOV.
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove the AND instruction.
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch.
    // Remove all successors shadowed by the new unconditional branch.
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to an unconditional branch.
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch.
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
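
// If two identical S_SET_GPR_IDX_ON instructions are separated only by
// indirect vector moves executed in the same mode, the second one (and any
// intervening S_SET_GPR_IDX_OFF) is redundant. A simplified, illustrative
// sketch of the rewrite (not taken from a real compilation):
//   S_SET_GPR_IDX_ON $sgpr2, 1
//   $vgpr0 = V_MOV_B32_indirect_read ...
//   S_SET_GPR_IDX_OFF
//   S_SET_GPR_IDX_ON $sgpr2, 1      ; identical to the first
//   $vgpr1 = V_MOV_B32_indirect_read ...
// =>
//   S_SET_GPR_IDX_ON $sgpr2, 1
//   $vgpr0 = V_MOV_B32_indirect_read ...
//   $vgpr1 = V_MOV_B32_indirect_read ...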
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ONs to
  // check that removing the second one is safe.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

namespace {
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability of execz being false, i.e. of
    // the then-block being executed rather than skipped. The transformation
    // is profitable if always executing the 'then' block is cheaper than
    // executing it only sometimes and always executing s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
    //   BranchNotTakenCost
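    //
    // A worked example with made-up numbers: if P = 3/4 (N = 3, D = 4),
    // BranchTakenCost = 4 and BranchNotTakenCost = 1, the check below becomes
    //   (4-3)*ThenCyclesCost <= (4-3)*4 + 3*1
    // so removing the branch stays profitable while the accumulated
    // then-block cost is at most 7 cycles.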
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain the cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Consider only branches that are both legal and profitable to remove.
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check the first terminator for branches to optimize.
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if the
    // second one is not needed. Do the expensive checks in optimizeSetGPR()
    // and limit the distance to 20 instructions for compile-time purposes.
    // Note: this needs to work on bundles, as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}