//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC   // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0             // This instruction is an optional
///                                    // optimization which allows us to
///                                    // branch if all the bits of
///                                    // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0  // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the ELSE block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  std::pair<MachineBasicBlock *, MachineBasicBlock *>
  splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

  void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                               const MachineRegisterInfo &MRI,
                               const MachineInstr &MI,
                               MachineBasicBlock &LoopBB,
                               MachineBasicBlock &RemainderBB,
                               unsigned SaveReg,
                               const MachineOperand &IdxReg);

  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
                              MachineInstr *MovRel,
                              const MachineOperand &IdxReg,
                              int Offset);

  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
                                                       int Offset) const;
  bool indirectSrc(MachineInstr &MI);
  bool indirectDst(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;

FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

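// Decide whether it is worth emitting a skip branch over the blocks from
// \p From up to (but not including) \p To. Instructions are counted until
// SkipThreshold is reached (inline asm is estimated from its byte length);
// a VCC-based scalar branch forces a skip, because such a branch is never
// taken when EXEC is zero and a uniform loop inside divergent control flow
// could otherwise run forever.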
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {
  if (From->succ_empty())
    return false;

  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // The inline asm length estimate is the number of bytes assuming the
        // longest instruction.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

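// Pixel shaders can kill the remaining lanes of a wavefront. If the rest of
// the program is long enough to make it worthwhile, insert a block after
// \p MI that exports to the NULL target and terminates the wavefront; the
// main path branches over it with S_CBRANCH_EXECNZ to \p NextBB, so it is
// only reached once every lane has been killed (EXEC == 0). Returns true if
// the skip block was inserted.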
bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
  MBB.addSuccessor(SkipBB);

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
    .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
      .addReg(AMDGPU::EXEC)
      .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Dst);

  MI.eraseFromParent();
}

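// The pseudos below implement loop control flow by accumulating, in an SGPR
// pair, the exec mask of lanes that have already left the loop. Schematically
// (register names are illustrative):
//
//   %sgpr1 = S_OR_B64 %exec, %sgpr0    // SI_BREAK: add the current lanes to the
//                                      //           mask of lanes leaving the loop
//   %exec = S_ANDN2_B64 %exec, %sgpr1  // SI_LOOP: disable lanes that have left,
//   S_CBRANCH_EXECNZ header            //          loop again while any lane is live
//   %exec = S_OR_B64 %exec, %sgpr1     // SI_END_CF: re-enable the saved lanes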
void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Vcc)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Saved)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
      .addImm(0)
      .addOperand(Op);
  }

  MI.eraseFromParent();
}

// All currently live registers must remain so in the remainder block.
void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                                                 const MachineRegisterInfo &MRI,
                                                 const MachineInstr &MI,
                                                 MachineBasicBlock &LoopBB,
                                                 MachineBasicBlock &RemainderBB,
                                                 unsigned SaveReg,
                                                 const MachineOperand &IdxReg) {
  // Add reg defined in loop body.
  RemainderLiveRegs.addReg(SaveReg);

  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
    if (!Val->isUndef()) {
      RemainderLiveRegs.addReg(Val->getReg());
      LoopBB.addLiveIn(Val->getReg());
    }
  }

  for (unsigned Reg : RemainderLiveRegs) {
    if (MRI.isAllocatable(Reg))
      RemainderBB.addLiveIn(Reg);
  }

  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  if (!Src->isUndef())
    LoopBB.addLiveIn(Src->getReg());

  if (!IdxReg.isUndef())
    LoopBB.addLiveIn(IdxReg.getReg());
  LoopBB.sortUniqueLiveIns();
}

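// Emit the body of the waterfall loop that reads a lane-varying index held in
// a VGPR into M0. Each iteration handles the group of lanes that share one
// index value. Schematically (illustrative, with Offset == 0):
//
//   loop:
//     %vcc_lo = V_READFIRSTLANE_B32 %idx   // pick the index of one active lane
//     %m0 = S_MOV_B32 %vcc_lo
//     V_CMP_EQ_U32 %m0, %idx               // lanes that use the same index
//     %vcc = S_AND_SAVEEXEC_B64 %vcc       // restrict exec to those lanes
//     <MovRel>                             // the actual indirect move
//     %exec = S_XOR_B64 %exec, %vcc        // retire the lanes just handled
//     S_CBRANCH_EXECNZ loop                // repeat until no lane is left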
void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
                                                DebugLoc DL,
                                                MachineInstr *MovRel,
                                                const MachineOperand &IdxReg,
                                                int Offset) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  // Read the next variant into VCC (lower 32 bits) <- also loop target
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Move index from VCC into M0
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    .addReg(AMDGPU::VCC_LO);

  // Compare the just read M0 value to all possible Idx values
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
    .addReg(AMDGPU::M0)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Update EXEC, save the original EXEC value to VCC
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    .addReg(AMDGPU::VCC);

  if (Offset != 0) {
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addReg(AMDGPU::M0)
      .addImm(Offset);
  }

  // Do the actual move
  LoopBB.insert(I, MovRel);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::VCC);

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);
}

MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);

  return SkipBB;
}

std::pair<MachineBasicBlock *, MachineBasicBlock *>
SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) {
  MachineFunction *MF = MBB.getParent();

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessors(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  return std::make_pair(LoopBB, RemainderBB);
}

// Returns true if a new block was inserted.
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
    if (Offset != 0) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
        .addImm(Offset);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
    }

    MBB.insert(I, MovRel);
    MI.eraseFromParent();
    return false;
  }

  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  SaveOp->setIsDead(false);
  unsigned Save = SaveOp->getReg();

  // Reading from a VGPR requires looping over all workitems in the wavefront.
  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    .addReg(AMDGPU::EXEC);

  LivePhysRegs RemainderLiveRegs(TRI);

  RemainderLiveRegs.addLiveOuts(MBB);

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;

  std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);

  for (const MachineInstr &Inst : reverse(*RemainderBB))
    RemainderLiveRegs.stepBackward(Inst);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  LoopBB->addSuccessor(RemainderBB);
  LoopBB->addSuccessor(LoopBB);

  splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
                          *RemainderBB, Save, *Idx);

  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(Save);

  MI.eraseFromParent();
  return true;
}

/// \param @VecReg The register which holds element zero of the vector being
///                addressed into.
///
/// \param[in] @Idx The index operand from the movrel instruction. This must be
///                 a register, but may be NoRegister.
///
/// \param[in,out] @Offset As an input, this is the constant offset part of the
///                        indirect Index, e.g. v0 = v[VecReg + Offset]. As an
///                        output, this is a constant value that needs to be
///                        added to the value stored in M0.
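///
/// For example (illustrative): for a four-element vector whose first element
/// is in VGPR4, an in-range constant offset of 2 yields (VGPR6, 0), while an
/// out-of-range offset of 7 is returned unchanged as (VGPR4, 7) and is left
/// to be folded into M0.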
std::pair<unsigned, int>
SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int NumElts = SuperRC->getSize() / RC->getSize();

  int BaseRegIdx = TRI->getHWRegIndex(SubReg);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts)
    return std::make_pair(RC->getRegister(BaseRegIdx), Offset);

  int RegIdx = BaseRegIdx + Offset;
  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  unsigned Reg = RC->getRegister(RegIdx);
  return std::make_pair(Reg, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
      .addReg(SrcVec->getReg(), RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}

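// Lower SI_INDIRECT_DST_*: with a constant-only index the value is written
// with a plain V_MOV_B32 into the selected element; with a VGPR index the
// write goes through V_MOVRELD_B32 inside the waterfall loop built by loadM0.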
// Return true if a new block was inserted.
bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);

  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
      .addOperand(*Val);
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
      .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
      .addReg(Dst, RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
        ExecModified = true;

      switch (MI.getOpcode()) {
      default: break;
      case AMDGPU::SI_IF:
        ++Depth;
        If(MI);
        break;

      case AMDGPU::SI_ELSE:
        Else(MI, ExecModified);
        break;

      case AMDGPU::SI_BREAK:
        Break(MI);
        break;

      case AMDGPU::SI_IF_BREAK:
        IfBreak(MI);
        break;

      case AMDGPU::SI_ELSE_BREAK:
        ElseBreak(MI);
        break;

      case AMDGPU::SI_LOOP:
        ++Depth;
        Loop(MI);
        break;

      case AMDGPU::SI_END_CF:
        if (--Depth == 0 && HaveKill) {
          HaveKill = false;
          // TODO: Insert skip if exec is 0?
        }

        EndCf(MI);
        break;

      case AMDGPU::SI_KILL_TERMINATOR:
        if (Depth == 0) {
          if (skipIfDead(MI, *NextBB)) {
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else
          HaveKill = true;
        Kill(MI);
        break;

      case AMDGPU::S_BRANCH:
        Branch(MI);
        break;

      case AMDGPU::SI_INDIRECT_SRC_V1:
      case AMDGPU::SI_INDIRECT_SRC_V2:
      case AMDGPU::SI_INDIRECT_SRC_V4:
      case AMDGPU::SI_INDIRECT_SRC_V8:
      case AMDGPU::SI_INDIRECT_SRC_V16:
        if (indirectSrc(MI)) {
          // The block was split at this point. We can safely skip the middle
          // inserted block to the following which contains the rest of this
          // block's instructions.
          NextBB = std::next(BI);
          BE = MF.end();
          Next = MBB.end();
        }

        break;

      case AMDGPU::SI_INDIRECT_DST_V1:
      case AMDGPU::SI_INDIRECT_DST_V2:
      case AMDGPU::SI_INDIRECT_DST_V4:
      case AMDGPU::SI_INDIRECT_DST_V8:
      case AMDGPU::SI_INDIRECT_DST_V16:
        if (indirectDst(MI)) {
          // The block was split at this point. We can safely skip the middle
          // inserted block to the following which contains the rest of this
          // block's instructions.
          NextBB = std::next(BI);
          BE = MF.end();
          Next = MBB.end();
        }

        break;

      case AMDGPU::SI_RETURN: {
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN is not the last instruction. Add an empty block at
          // the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;
      }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    if (NeedFlat)
      MFI->setHasFlatInstructions(true);
  }

  return true;
}