//===-- SILowerControlFlow.cpp - Use predicates for control flow ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64-bits, one bit for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC   // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0             // This instruction is an optional
///                                    // optimization which allows us to
///                                    // branch if all the bits of
///                                    // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0  // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0  // Save the IF block's exec mask and
///                                    // restore the full exec mask
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Enable only the ELSE block's lanes
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
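///
/// The loop pseudos are expanded with the same EXEC-mask bookkeeping. A sketch
/// of the individual expansions performed by this pass (register names and the
/// label are illustrative; the break mask in %SGPR0 is initialized outside this
/// pass):
///
/// %SGPR0 = SI_IF_BREAK %VCC, %SGPR0  =>  %SGPR0 = S_OR_B64 %VCC, %SGPR0
/// SI_LOOP %SGPR0, label_loop         =>  %EXEC = S_ANDN2_B64 %EXEC, %SGPR0
///                                        S_CBRANCH_EXECNZ label_loop
/// SI_END_CF %SGPR0                   =>  %EXEC = S_OR_B64 %EXEC, %SGPR0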
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  bool skipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  std::pair<MachineBasicBlock *, MachineBasicBlock *>
  splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

  void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                               const MachineRegisterInfo &MRI,
                               const MachineInstr &MI,
                               MachineBasicBlock &LoopBB,
                               MachineBasicBlock &RemainderBB,
                               unsigned SaveReg,
                               const MachineOperand &IdxReg);

  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
                              MachineInstr *MovRel,
                              const MachineOperand &IdxReg,
                              int Offset);

  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
                                                       int Offset) const;
  bool indirectSrc(MachineInstr &MI);
  bool indirectDst(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;

FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {
  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // The inline asm length estimate is the number of bytes assuming
        // the longest instruction.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

bool SILowerControlFlow::skipIfDead(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();

  if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return false;

  LivePhysRegs RemainderLiveRegs(TRI);
  RemainderLiveRegs.addLiveOuts(MBB);

  MachineBasicBlock *SkipBB;
  MachineBasicBlock *RemainderBB;
  std::tie(SkipBB, RemainderBB) = splitBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(RemainderBB);

  MBB.addSuccessor(RemainderBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  for (const MachineInstr &Inst : reverse(*RemainderBB))
    RemainderLiveRegs.stepBackward(Inst);

  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  for (unsigned Reg : RemainderLiveRegs) {
    if (MRI.isAllocatable(Reg))
      RemainderBB->addLiveIn(Reg);
  }

  return true;
}

void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
    .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
      .addReg(AMDGPU::EXEC)
      .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Dst);

  MI.eraseFromParent();
}

void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Vcc)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Saved)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}
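
// A sketch of the SI_KILL expansion done by Kill() below (the VGPR name is
// illustrative):
//   SI_KILL <imm with sign bit set>  =>  S_MOV_B64 %EXEC, 0  (static kill)
//   SI_KILL <non-negative imm>       =>  nothing; the kill can never fire
//   SI_KILL %VGPR0                   =>  V_CMPX_LE_F32_e32 0, %VGPR0
//                                        (V_CMPX writes EXEC, disabling lanes
//                                        whose value is negative)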
void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative.
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
      .addImm(0)
      .addOperand(Op);
  }

  MI.eraseFromParent();
}

// All currently live registers must remain so in the remainder block.
void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                                                 const MachineRegisterInfo &MRI,
                                                 const MachineInstr &MI,
                                                 MachineBasicBlock &LoopBB,
                                                 MachineBasicBlock &RemainderBB,
                                                 unsigned SaveReg,
                                                 const MachineOperand &IdxReg) {
  // Add reg defined in loop body.
  RemainderLiveRegs.addReg(SaveReg);

  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
    if (!Val->isUndef()) {
      RemainderLiveRegs.addReg(Val->getReg());
      LoopBB.addLiveIn(Val->getReg());
    }
  }

  for (unsigned Reg : RemainderLiveRegs) {
    if (MRI.isAllocatable(Reg))
      RemainderBB.addLiveIn(Reg);
  }

  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  if (!Src->isUndef())
    LoopBB.addLiveIn(Src->getReg());

  if (!IdxReg.isUndef())
    LoopBB.addLiveIn(IdxReg.getReg());
  LoopBB.sortUniqueLiveIns();
}

void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
                                                DebugLoc DL,
                                                MachineInstr *MovRel,
                                                const MachineOperand &IdxReg,
                                                int Offset) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  // Read the next variant into VCC (lower 32 bits) <- also loop target
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Move index from VCC into M0
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    .addReg(AMDGPU::VCC_LO);

  // Compare the just read M0 value to all possible Idx values
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
    .addReg(AMDGPU::M0)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Update EXEC, save the original EXEC value to VCC
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    .addReg(AMDGPU::VCC);

  if (Offset != 0) {
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addReg(AMDGPU::M0)
      .addImm(Offset);
  }

  // Do the actual move
  LoopBB.insert(I, MovRel);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::VCC);

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);
}
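
// For reference, the loop built above expands to roughly the following
// sequence (a sketch; register choices are illustrative, and the S_ADD_I32
// only appears when a non-zero constant offset is folded in):
//
// LoopBB:
//   V_READFIRSTLANE_B32 %VCC_LO, %idx  // take the index of one active lane
//   S_MOV_B32 %M0, %VCC_LO
//   V_CMP_EQ_U32_e32 %M0, %idx         // lanes that share this index
//   S_AND_SAVEEXEC_B64 %VCC, %VCC      // restrict EXEC to those lanes
//   S_ADD_I32 %M0, %M0, <offset>       // optional constant offset
//   <MovRel>                           // the indirect V_MOVRELS/V_MOVRELD
//   S_XOR_B64 %EXEC, %EXEC, %VCC       // retire the lanes just handled
//   S_CBRANCH_EXECNZ LoopBB            // repeat while any lanes remain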

std::pair<MachineBasicBlock *, MachineBasicBlock *>
SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) {
  MachineFunction *MF = MBB.getParent();

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessors(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  return std::make_pair(LoopBB, RemainderBB);
}

// Returns true if a new block was inserted.
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
    if (Offset != 0) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
        .addImm(Offset);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
    }

    MBB.insert(I, MovRel);
    MI.eraseFromParent();
    return false;
  }

  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  SaveOp->setIsDead(false);
  unsigned Save = SaveOp->getReg();

  // Reading from a VGPR requires looping over all workitems in the wavefront.
  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    .addReg(AMDGPU::EXEC);

  LivePhysRegs RemainderLiveRegs(TRI);

  RemainderLiveRegs.addLiveOuts(MBB);

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;

  std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);

  for (const MachineInstr &Inst : reverse(*RemainderBB))
    RemainderLiveRegs.stepBackward(Inst);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  LoopBB->addSuccessor(RemainderBB);
  LoopBB->addSuccessor(LoopBB);

  splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
                          *RemainderBB, Save, *Idx);

  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(Save);

  MI.eraseFromParent();
  return true;
}
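
// Worked example for the offset folding done by computeIndirectRegAndOffset
// below (a sketch; the actual registers depend on allocation). Assume the
// vector occupies VGPR4_VGPR5_VGPR6_VGPR7, i.e. four 32-bit elements:
//   Offset = 2 (in bounds)      ->  (VGPR6, 0)  the offset is folded into the
//                                               base register; M0 needs no add.
//   Offset = 9 (out of bounds)  ->  (VGPR4, 9)  the register stays at element
//                                               zero and the offset is left to
//                                               be added to M0.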

/// \param VecReg The register which holds element zero of the vector being
///        addressed into.
///
/// \param Offset As an input, this is the constant offset part of the indirect
///        index, e.g. v0 = v[VecReg + Offset]. As an output (the second member
///        of the returned pair), this is the constant value that still needs to
///        be added to the index stored in M0.
///
/// \returns The base register to use in the indirect addressing instruction and
///        the remaining constant offset.
std::pair<unsigned, int>
SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int NumElts = SuperRC->getSize() / RC->getSize();

  int BaseRegIdx = TRI->getHWRegIndex(SubReg);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts)
    return std::make_pair(RC->getRegister(BaseRegIdx), Offset);

  int RegIdx = BaseRegIdx + Offset;
  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  unsigned Reg = RC->getRegister(RegIdx);
  return std::make_pair(Reg, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
      .addReg(SrcVec->getReg(), RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);

  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
      .addOperand(*Val);
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
      .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
      .addReg(Dst, RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
        ExecModified = true;

      switch (MI.getOpcode()) {
      default: break;
      case AMDGPU::SI_IF:
        ++Depth;
        If(MI);
        break;

      case AMDGPU::SI_ELSE:
        Else(MI, ExecModified);
        break;

      case AMDGPU::SI_BREAK:
        Break(MI);
        break;

      case AMDGPU::SI_IF_BREAK:
        IfBreak(MI);
        break;

      case AMDGPU::SI_ELSE_BREAK:
        ElseBreak(MI);
        break;

      case AMDGPU::SI_LOOP:
        ++Depth;
        Loop(MI);
        break;

      case AMDGPU::SI_END_CF:
        if (--Depth == 0 && HaveKill) {
          HaveKill = false;

          if (skipIfDead(MI)) {
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }
        }
        EndCf(MI);
        break;

      case AMDGPU::SI_KILL:
        if (Depth == 0) {
          if (skipIfDead(MI)) {
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }
        } else
          HaveKill = true;
        Kill(MI);
        break;

      case AMDGPU::S_BRANCH:
        Branch(MI);
        break;

      case AMDGPU::SI_INDIRECT_SRC_V1:
      case AMDGPU::SI_INDIRECT_SRC_V2:
      case AMDGPU::SI_INDIRECT_SRC_V4:
      case AMDGPU::SI_INDIRECT_SRC_V8:
      case AMDGPU::SI_INDIRECT_SRC_V16:
        if (indirectSrc(MI)) {
          // The block was split at this point. We can safely skip the newly
          // inserted middle block and continue with the following block, which
          // contains the rest of this block's instructions.
          NextBB = std::next(BI);
          BE = MF.end();
          Next = MBB.end();
        }

        break;

      case AMDGPU::SI_INDIRECT_DST_V1:
      case AMDGPU::SI_INDIRECT_DST_V2:
      case AMDGPU::SI_INDIRECT_DST_V4:
      case AMDGPU::SI_INDIRECT_DST_V8:
      case AMDGPU::SI_INDIRECT_DST_V16:
        if (indirectDst(MI)) {
          // The block was split at this point. We can safely skip the newly
          // inserted middle block and continue with the following block, which
          // contains the rest of this block's instructions.
          NextBB = std::next(BI);
          BE = MF.end();
          Next = MBB.end();
        }

        break;

      case AMDGPU::SI_RETURN: {
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN is not the last instruction. Add an empty block at
          // the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;
      }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    if (NeedFlat)
      MFI->setHasFlatInstructions(true);
  }

  return true;
}