//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
static unsigned isCopyFromExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
static unsigned isCopyToExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
    llvm_unreachable("should have been replaced");
  }

  return AMDGPU::NoRegister;
}
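// For illustration, the two helpers above match copies of roughly this shape
// (a sketch; the register names are made up and follow the notation of the
// example comments later in this file):
//
//   %sgpr0_sgpr1 = COPY %exec        <- isCopyFromExec returns %sgpr0_sgpr1
//   ...
//   %exec = COPY %sgpr2_sgpr3        <- isCopyToExec returns %sgpr2_sgpr3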
/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static unsigned isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions. Only one of
// these is expected per block.
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B64_term:
    MI.setDesc(TII.get(AMDGPU::COPY));
    return true;
  case AMDGPU::S_XOR_B64_term:
    MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
    return true;
  case AMDGPU::S_OR_B64_term:
    MI.setDesc(TII.get(AMDGPU::S_OR_B64));
    return true;
  case AMDGPU::S_ANDN2_B64_term:
    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
    return true;
  default:
    return false;
  }
}

static MachineBasicBlock::reverse_iterator fixTerminators(
    const SIInstrInfo &TII,
    MachineBasicBlock &MBB) {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return I;

    if (removeTerminatorBit(TII, *I))
      return I;
  }

  return E;
}
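// For example (a sketch; the register names are illustrative), a block that
// ends in
//
//   %exec = S_MOV_B64_term %sgpr0_sgpr1
//
// has its terminator turned back into
//
//   %exec = COPY %sgpr0_sgpr1
//
// which the exec-copy matching below can then recognize.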
static MachineBasicBlock::reverse_iterator findExecCopy(
    const SIInstrInfo &TII,
    MachineBasicBlock &MBB,
    MachineBasicBlock::reverse_iterator I,
    unsigned CopyToExec) {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    unsigned CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec != AMDGPU::NoRegister)
      return I;
  }

  return E;
}

// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}
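// A concrete instance of the rewrite performed below (a sketch; the register
// names and the choice of s_and are illustrative):
//
//   %sgpr0_sgpr1 = COPY %exec
//   %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1, %vcc
//   %exec = COPY %sgpr2_sgpr3
// =>
//   %sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 %vcc
//
// s_and_saveexec_b64 saves the old exec value into its destination and ands
// the source operand into exec, so the three instructions collapse into one.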
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Optimize sequences emitted for control flow lowering. They are originally
  // emitted as separate operations because spill code may need to be inserted
  // for the saved copy of exec.
  //
  //   x = copy exec
  //   z = s_<op>_b64 x, y
  //   exec = copy z
  // =>
  //   x = s_<op>_saveexec_b64 y
  //

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    unsigned CopyToExec = isCopyToExec(*I);
    if (CopyToExec == AMDGPU::NoRegister)
      continue;

    // Scan backwards to find the def.
    auto CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator J
           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          LLVM_DEBUG(dbgs()
                     << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy before it
        // is rewritten by the saveexec, i.e. it must effectively have one use.
        // There may have been another use, such as an inserted spill. For
        // example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
                                    AMDGPU::NoSubRegister, *TRI);
    }
  }

  return true;
}