//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIOptimizeExecMasking.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  MCRegister Exec;

  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
  SmallVector<MachineOperand *, 1> KillFlagCandidates;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB,
               MachineBasicBlock::reverse_iterator I) const;
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  MachineInstr *findInstrBackwards(
      MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
      ArrayRef<MCRegister> NonModifiableRegs,
      MachineInstr *Terminator = nullptr,
      SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
      unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;

  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
  bool optimizeOrSaveexecXorSequences();

public:
  bool run(MachineFunction &MF);
};

class SIOptimizeExecMaskingLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

PreservedAnalyses
SIOptimizeExecMaskingPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &) {
  SIOptimizeExecMasking Impl;

  if (!Impl.run(MF))
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingLegacy, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(SIOptimizeExecMaskingLegacy, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMaskingLegacy::ID = 0;

char &llvm::SIOptimizeExecMaskingLegacyID = SIOptimizeExecMaskingLegacy::ID;

/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - Seems LiveRegUnits doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for at most MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - Pred
// can be an arbitrary condition based on the current MachineInstr, for
// instance a target instruction. Breaks prematurely by returning nullptr if
// one of the registers given in NonModifiableRegs is modified by the current
// instruction.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      // Check for kills that appear after the terminator instruction, that
      // would not be detected by clearKillFlags, since they will cause the
      // register to be dead at a later place, causing the verifier to fail.
      // We use the candidates to clear the kill flags later.
      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by calculating liveness backwards from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LiveRegUnits LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);

  if (IgnoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(Reg) || MRI->isReserved(Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}

// Optimize sequences emitted for control flow lowering. They are originally
// emitted as separate operations because spill code may need to be
// inserted for the saved copy of exec.
//
//     x = copy exec
//     z = s_<op>_b64 x, y
//     exec = copy z
// =>
//     x = s_<op>_saveexec_b64 y
//
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        }
        LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J
                          << '\n');
        break;
      }
      if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill.
        // For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX is implicitly writing to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();

  return true;
}

// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec
  // sequence by looking at an instance of an s_and_saveexec instruction.
  // Records the v_cmp instruction if it is safe to replace the sequence (see
  // the conditions in the function body). This is after register allocation,
  // so some checks on operand dependencies need to be considered.
  MachineInstr *VCmp = nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand in
  // between.
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      MI.modifiesRegister(Src0->getReg(), TRI))
    return;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      MI.modifiesRegister(Src1->getReg(), TRI))
    return;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return;

  // If the v_cmp target is in use between the v_cmp and the s_and_saveexec,
  // or after the s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
          VCmp, &KillFlagCandidates))
    return;

  if (VCmp)
    SaveExecVCmpMapping[&MI] = VCmp;
}

// Record occurrences of
// s_or_saveexec s_o, s_i
// s_xor exec, exec, s_o
// to be replaced with
// s_andn2_saveexec s_o, s_i.
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg()) {
        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
        }
      }
    }
  }
}

bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32()
                                   ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                   : AMDGPU::S_ANDN2_SAVEEXEC_B64;

  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}

bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIOptimizeExecMasking().run(MF);
}

bool SIOptimizeExecMasking::run(MachineFunction &MF) {
  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
  // Only scan a small window of non-debug instructions at the end of each
  // block for the or_saveexec/xor and v_cmp/saveexec patterns, and stop once
  // an instruction modifying exec is seen.
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow) {
        break;
      }

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI)) {
        break;
      }

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}