//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the ScalarALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
46 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block 47 /// label1: 48 /// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits 49 //===----------------------------------------------------------------------===// 50 51 #include "AMDGPU.h" 52 #include "AMDGPUSubtarget.h" 53 #include "SIInstrInfo.h" 54 #include "SIMachineFunctionInfo.h" 55 #include "llvm/CodeGen/LivePhysRegs.h" 56 #include "llvm/CodeGen/MachineFrameInfo.h" 57 #include "llvm/CodeGen/MachineFunction.h" 58 #include "llvm/CodeGen/MachineFunctionPass.h" 59 #include "llvm/CodeGen/MachineInstrBuilder.h" 60 #include "llvm/CodeGen/MachineRegisterInfo.h" 61 62 using namespace llvm; 63 64 #define DEBUG_TYPE "si-lower-control-flow" 65 66 namespace { 67 68 class SILowerControlFlow : public MachineFunctionPass { 69 private: 70 const SIRegisterInfo *TRI; 71 const SIInstrInfo *TII; 72 LiveIntervals *LIS; 73 74 void emitIf(MachineInstr &MI); 75 void emitElse(MachineInstr &MI); 76 void emitBreak(MachineInstr &MI); 77 void emitIfBreak(MachineInstr &MI); 78 void emitElseBreak(MachineInstr &MI); 79 void emitLoop(MachineInstr &MI); 80 void emitEndCf(MachineInstr &MI); 81 82 public: 83 static char ID; 84 85 SILowerControlFlow() : 86 MachineFunctionPass(ID), 87 TRI(nullptr), 88 TII(nullptr), 89 LIS(nullptr) {} 90 91 bool runOnMachineFunction(MachineFunction &MF) override; 92 93 const char *getPassName() const override { 94 return "SI Lower control flow pseudo instructions"; 95 } 96 97 void getAnalysisUsage(AnalysisUsage &AU) const override { 98 AU.addPreserved<LiveIntervals>(); 99 AU.addPreserved<SlotIndexes>(); 100 AU.setPreservesCFG(); 101 MachineFunctionPass::getAnalysisUsage(AU); 102 } 103 }; 104 105 } // End anonymous namespace 106 107 char SILowerControlFlow::ID = 0; 108 109 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE, 110 "SI lower control flow", false, false) 111 112 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; 113 114 void SILowerControlFlow::emitIf(MachineInstr &MI) { 115 
MachineBasicBlock &MBB = *MI.getParent(); 116 const DebugLoc &DL = MI.getDebugLoc(); 117 MachineBasicBlock::iterator I(&MI); 118 119 MachineOperand &SaveExec = MI.getOperand(0); 120 MachineOperand &Cond = MI.getOperand(1); 121 assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister && 122 Cond.getSubReg() == AMDGPU::NoSubRegister); 123 124 unsigned SaveExecReg = SaveExec.getReg(); 125 126 MachineInstr *AndSaveExec = 127 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg) 128 .addOperand(Cond); 129 130 MachineInstr *Xor = 131 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg) 132 .addReg(AMDGPU::EXEC) 133 .addReg(SaveExecReg); 134 135 // Insert a pseudo terminator to help keep the verifier happy. This will also 136 // be used later when inserting skips. 137 MachineInstr *NewBr = 138 BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) 139 .addOperand(MI.getOperand(2)) 140 .addReg(SaveExecReg, getKillRegState(SaveExec.isKill())); 141 142 if (!LIS) { 143 MI.eraseFromParent(); 144 return; 145 } 146 147 148 LIS->ReplaceMachineInstrInMaps(MI, *AndSaveExec); 149 LIS->InsertMachineInstrInMaps(*Xor); 150 LIS->InsertMachineInstrInMaps(*NewBr); 151 152 MI.eraseFromParent(); 153 154 // FIXME: Is there a better way of adjusting the liveness? It shouldn't be 155 // hard to add another def here but I'm not sure how to correctly update the 156 // valno. 157 LIS->removeInterval(SaveExecReg); 158 LIS->createAndComputeVirtRegInterval(SaveExecReg); 159 } 160 161 void SILowerControlFlow::emitElse(MachineInstr &MI) { 162 MachineBasicBlock &MBB = *MI.getParent(); 163 const DebugLoc &DL = MI.getDebugLoc(); 164 165 unsigned DstReg = MI.getOperand(0).getReg(); 166 assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister); 167 168 bool ExecModified = MI.getOperand(3).getImm() != 0; 169 MachineBasicBlock::iterator Start = MBB.begin(); 170 171 // This must be inserted before phis and any spill code inserted before the 172 // else. 
173 MachineInstr *OrSaveExec = 174 BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg) 175 .addOperand(MI.getOperand(1)); // Saved EXEC 176 MachineBasicBlock *DestBB = MI.getOperand(2).getMBB(); 177 178 MachineBasicBlock::iterator ElsePt(MI); 179 180 if (ExecModified) { 181 MachineInstr *And = 182 BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg) 183 .addReg(AMDGPU::EXEC) 184 .addReg(DstReg); 185 186 if (LIS) 187 LIS->InsertMachineInstrInMaps(*And); 188 } 189 190 MachineInstr *Xor = 191 BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) 192 .addReg(AMDGPU::EXEC) 193 .addReg(DstReg); 194 195 MachineBasicBlock::iterator Term = MBB.getFirstTerminator(); 196 // Insert a pseudo terminator to help keep the verifier happy. 197 MachineInstr *Branch = 198 BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) 199 .addMBB(DestBB) 200 .addReg(DstReg); 201 202 if (!LIS) { 203 MI.eraseFromParent(); 204 return; 205 } 206 207 LIS->RemoveMachineInstrFromMaps(MI); 208 MI.eraseFromParent(); 209 210 LIS->InsertMachineInstrInMaps(*OrSaveExec); 211 212 LIS->InsertMachineInstrInMaps(*Xor); 213 LIS->InsertMachineInstrInMaps(*Branch); 214 215 // src reg is tied to dst reg. 216 LIS->removeInterval(DstReg); 217 LIS->createAndComputeVirtRegInterval(DstReg); 218 219 // Let this be recomputed. 
220 LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); 221 } 222 223 void SILowerControlFlow::emitBreak(MachineInstr &MI) { 224 MachineBasicBlock &MBB = *MI.getParent(); 225 const DebugLoc &DL = MI.getDebugLoc(); 226 unsigned Dst = MI.getOperand(0).getReg(); 227 228 MachineInstr *Or = 229 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) 230 .addReg(AMDGPU::EXEC) 231 .addOperand(MI.getOperand(1)); 232 233 if (LIS) 234 LIS->ReplaceMachineInstrInMaps(MI, *Or); 235 MI.eraseFromParent(); 236 } 237 238 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { 239 MI.setDesc(TII->get(AMDGPU::S_OR_B64)); 240 } 241 242 void SILowerControlFlow::emitElseBreak(MachineInstr &MI) { 243 MI.setDesc(TII->get(AMDGPU::S_OR_B64)); 244 } 245 246 void SILowerControlFlow::emitLoop(MachineInstr &MI) { 247 MachineBasicBlock &MBB = *MI.getParent(); 248 const DebugLoc &DL = MI.getDebugLoc(); 249 250 MachineInstr *AndN2 = 251 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) 252 .addReg(AMDGPU::EXEC) 253 .addOperand(MI.getOperand(0)); 254 255 MachineInstr *Branch = 256 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 257 .addOperand(MI.getOperand(1)); 258 259 if (LIS) { 260 LIS->ReplaceMachineInstrInMaps(MI, *AndN2); 261 LIS->InsertMachineInstrInMaps(*Branch); 262 } 263 264 MI.eraseFromParent(); 265 } 266 267 void SILowerControlFlow::emitEndCf(MachineInstr &MI) { 268 MachineBasicBlock &MBB = *MI.getParent(); 269 const DebugLoc &DL = MI.getDebugLoc(); 270 271 MachineBasicBlock::iterator InsPt = MBB.begin(); 272 MachineInstr *NewMI = 273 BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) 274 .addReg(AMDGPU::EXEC) 275 .addOperand(MI.getOperand(0)); 276 277 if (LIS) 278 LIS->ReplaceMachineInstrInMaps(MI, *NewMI); 279 280 MI.eraseFromParent(); 281 282 if (LIS) 283 LIS->handleMove(*NewMI); 284 } 285 286 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { 287 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 288 TII = 
ST.getInstrInfo(); 289 TRI = &TII->getRegisterInfo(); 290 291 // This doesn't actually need LiveIntervals, but we can preserve them. 292 LIS = getAnalysisIfAvailable<LiveIntervals>(); 293 294 MachineFunction::iterator NextBB; 295 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 296 BI != BE; BI = NextBB) { 297 NextBB = std::next(BI); 298 MachineBasicBlock &MBB = *BI; 299 300 MachineBasicBlock::iterator I, Next; 301 302 for (I = MBB.begin(); I != MBB.end(); I = Next) { 303 Next = std::next(I); 304 MachineInstr &MI = *I; 305 306 switch (MI.getOpcode()) { 307 case AMDGPU::SI_IF: 308 emitIf(MI); 309 break; 310 311 case AMDGPU::SI_ELSE: 312 emitElse(MI); 313 break; 314 315 case AMDGPU::SI_BREAK: 316 emitBreak(MI); 317 break; 318 319 case AMDGPU::SI_IF_BREAK: 320 emitIfBreak(MI); 321 break; 322 323 case AMDGPU::SI_ELSE_BREAK: 324 emitElseBreak(MI); 325 break; 326 327 case AMDGPU::SI_LOOP: 328 emitLoop(MI); 329 break; 330 331 case AMDGPU::SI_END_CF: 332 emitEndCf(MI); 333 break; 334 335 default: 336 break; 337 } 338 } 339 } 340 341 return true; 342 } 343