//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits wide, one bit
/// for each Vector ALU) and then the Scalar ALU will AND the VCC register
/// with EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Restore the exec mask for the ELSE block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;
  LiveIntervals *LIS;

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitBreak(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitElseBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);
  void emitEndCf(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID),
    TRI(nullptr),
    TII(nullptr),
    LIS(nullptr) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;

void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  MachineOperand &SaveExec = MI.getOperand(0);
  MachineOperand &Cond = MI.getOperand(1);
  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
         Cond.getSubReg() == AMDGPU::NoSubRegister);

  unsigned SaveExecReg = SaveExec.getReg();

  MachineInstr *AndSaveExec =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg)
    .addOperand(Cond);

  MachineInstr *Xor =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
    .addReg(AMDGPU::EXEC)
    .addReg(SaveExecReg);

  // Insert a pseudo terminator to help keep the verifier happy. This will also
  // be used later when inserting skips.
  MachineInstr *NewBr =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->ReplaceMachineInstrInMaps(MI, *AndSaveExec);
  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*NewBr);

  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
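  // For now, just drop the saved-exec register's live interval and recompute
  // it from scratch now that the new defs are in place.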
  LIS->removeInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(SaveExecReg);
}

void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned DstReg = MI.getOperand(0).getReg();
  assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);

  bool ExecModified = MI.getOperand(3).getImm() != 0;
  MachineBasicBlock::iterator Start = MBB.begin();

  // This must be inserted before phis and any spill code inserted before the
  // else.
  MachineInstr *OrSaveExec =
    BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg)
    .addOperand(MI.getOperand(1)); // Saved EXEC
  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  if (ExecModified) {
    MachineInstr *And =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
      .addReg(AMDGPU::EXEC)
      .addReg(DstReg);

    if (LIS)
      LIS->InsertMachineInstrInMaps(*And);
  }

  MachineInstr *Xor =
    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(DstReg);

  MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
  // Insert a pseudo terminator to help keep the verifier happy.
  MachineInstr *Branch =
    BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*OrSaveExec);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  // src reg is tied to dst reg.
  LIS->removeInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(DstReg);

  // Let this be recomputed.
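  // The inserted S_XOR_B64 adds a new def of EXEC, so drop the cached live
  // range for its register unit and let LiveIntervals rebuild it on demand.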
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
}

void SILowerControlFlow::emitBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();

  MachineInstr *Or =
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .addOperand(MI.getOperand(1));

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
  MI.eraseFromParent();
}

void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  // The break pseudos already carry S_OR_B64's operand layout, so lowering
  // only needs to retarget the descriptor.
  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
}

void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
}

void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addOperand(MI.getOperand(0));

  MachineInstr *Branch =
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}

void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt = MBB.begin();
  MachineInstr *NewMI =
    BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addOperand(MI.getOperand(0));

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();

  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock::iterator I, Next;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
        emitIf(MI);
        break;

      case AMDGPU::SI_ELSE:
        emitElse(MI);
        break;

      case AMDGPU::SI_BREAK:
        emitBreak(MI);
        break;

      case AMDGPU::SI_IF_BREAK:
        emitIfBreak(MI);
        break;

      case AMDGPU::SI_ELSE_BREAK:
        emitElseBreak(MI);
        break;

      case AMDGPU::SI_LOOP:
        emitLoop(MI);
        break;

      case AMDGPU::SI_END_CF:
        emitEndCf(MI);
        break;

      default:
        break;
      }
    }
  }

  return true;
}