//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Fold Operands";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

static bool isSafeToFold(unsigned Opcode) {
  switch(Opcode) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
    return true;
  default:
    return false;
  }
}

static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

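// Returns true if FoldList already contains a candidate that targets MI.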
static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_f32_e64 if we are trying to fold into src2.
    unsigned Opc = MI->getOpcode();
    if (Opc == AMDGPU::V_MAC_F32_e64 &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      // Check if changing this to a v_mad_f32 instruction will allow us to
      // fold the operand.
      MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an immediate, and OpNo may refer to it
    // after the call to commuteInstruction() below. Such situations are
    // avoided here explicitly, as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
      return false;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

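// Try to fold OpToFold into the operand at UseOpIdx of UseMI. 64-bit
// immediates are split for subregister uses, COPY users are rewritten to MOVs
// so they can accept an immediate, and REG_SEQUENCE users are followed
// through to their own uses.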
static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
                        unsigned UseOpIdx,
                        std::vector<FoldCandidate> &FoldList,
                        SmallVectorImpl<MachineInstr *> &CopiesToReplace,
                        const SIInstrInfo *TII, const SIRegisterInfo &TRI,
                        MachineRegisterInfo &MRI) {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands unless it is a full
    // copy, since a subregister use tied to a full register def doesn't really
    // make sense, e.g. don't fold:
    //
    // %vreg1 = COPY %vreg0:sub1
    // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg1<tied0>
    //
    // into
    // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  bool FoldingImm = OpToFold.isImm();
  APInt Imm;

  if (FoldingImm) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI.getRegClass(UseReg) :
      TRI.getPhysRegClass(UseReg);

    Imm = APInt(64, OpToFold.getImm());

    const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
    const TargetRegisterClass *FoldRC =
      TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);

    // Split 64-bit constants into 32-bits for folding.
    if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
      if (UseRC->getSize() != 8)
        return;

      if (UseOp.getSubReg() == AMDGPU::sub0) {
        Imm = Imm.getLoBits(32);
      } else {
        assert(UseOp.getSubReg() == AMDGPU::sub1);
        Imm = Imm.getHiBits(32);
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.
    if (UseMI->getOpcode() == AMDGPU::COPY) {
      unsigned DestReg = UseMI->getOperand(0).getReg();
      const TargetRegisterClass *DestRC
        = TargetRegisterInfo::isVirtualRegister(DestReg) ?
        MRI.getRegClass(DestReg) :
        TRI.getPhysRegClass(DestReg);

      unsigned MovOp = TII->getMovOpcode(DestRC);
      if (MovOp == AMDGPU::COPY)
        return;

      UseMI->setDesc(TII->get(MovOp));
      CopiesToReplace.push_back(UseMI);
    }
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI.use_begin(RegSeqDstReg),
           RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace, TII, TRI, MRI);
    }
    return;
  }

  const MCInstrDesc &UseDesc = UseMI->getDesc();

  // Don't fold into target independent nodes. Target independent opcodes
  // don't have defined register classes.
  if (UseDesc.isVariadic() ||
      UseDesc.OpInfo[UseOpIdx].RegClass == -1)
    return;

  if (FoldingImm) {
    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

  // FIXME: We could try to change the instruction from 64-bit to 32-bit
  // to enable more folding opportunities. The shrink operands pass
  // already does this.
  return;
}

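// Evaluate a 32-bit bitwise and/or/xor whose operands are both known
// constants. Writes the folded value to Result and returns true for the
// handled opcodes.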
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  int32_t LHS, int32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MachineOperand &Src0 = MI->getOperand(1);
    if (Src0.isImm()) {
      Src0.setImm(~Src0.getImm());
      MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
      return true;
    }

    return false;
  }

  if (!MI->isCommutable())
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    Src0->setImm(NewImm);
    MI->RemoveOperand(Src1Idx);
    MI->setDesc(TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      MI->setDesc(TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      MI->setDesc(TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      MI->setDesc(TII->get(AMDGPU::COPY));
    }
  }

  return false;
}

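// Walk every block, and for each foldable def accepted by isSafeToFold (movs
// and copies), collect fold candidates from its uses, rewrite those uses, and
// then try to constant fold the updated instructions.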
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI.getOpcode()))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      // ...
      // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      // We need to mutate the operands of new mov instructions to add implicit
      // uses of EXEC, but adding them invalidates the use_iterator, so defer
      // this.
      SmallVector<MachineInstr *, 4> CopiesToReplace;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
             Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();

        foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
                    CopiesToReplace, TII, TRI, MRI);
      }

      // Make sure we add EXEC uses to any new v_mov instructions created.
      for (MachineInstr *Copy : CopiesToReplace)
        Copy->addImplicitDefUseOperands(MF);

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (Fold.isReg()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            // FIXME: Probably shouldn't bother trying to fold if not an
            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
            // copies.
            MRI.clearKillFlags(Fold.OpToFold->getReg());
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');

          // Folding the immediate may reveal operations that can be constant
          // folded or replaced with a copy. This can happen for example after
          // frame indices are lowered to constants or from splitting 64-bit
          // constants.
          tryConstantFoldOp(MRI, TII, Fold.UseMI);
        }
      }
    }
  }
  return false;
}