//===-- SIFoldOperands.cpp - Fold operands ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;

  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

static bool isSafeToFold(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO: {
    // If there are additional implicit register operands, this may be used for
    // register indexing so the source register operand isn't simply copied.
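    // A plain move carries exactly the operands its descriptor lists
    // (explicit operands plus implicit uses such as EXEC for the VALU forms);
    // any surplus operand means the mov is not a simple copy of its source.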
    unsigned NumOps = MI.getDesc().getNumOperands() +
      MI.getDesc().getNumImplicitUses();

    return MI.getNumOperands() == NumOps;
  }
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
    return true;
  default:
    return false;
  }
}

// Apply the fold described by Fold to its use instruction, replacing the
// register use with the recorded immediate, frame index, or virtual register.
static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

// Record a fold of OpToFold into operand OpNo of MI if it is legal, or can be
// made legal by converting a V_MAC to a V_MAD or by commuting MI. Returns
// false if no legal fold was found.
static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call of commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
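    // Give up if either of the two commutable operands is not a register.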
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
      return false;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef();
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

// Try to fold OpToFold into the operand at UseOpIdx of UseMI, recursing
// through REG_SEQUENCE uses and rewriting foldable COPYs into movs. Successful
// candidates are appended to FoldList; rewritten copies to CopiesToReplace.
static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
                        unsigned UseOpIdx,
                        std::vector<FoldCandidate> &FoldList,
                        SmallVectorImpl<MachineInstr *> &CopiesToReplace,
                        const SIInstrInfo *TII, const SIRegisterInfo &TRI,
                        MachineRegisterInfo &MRI) {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands unless the fold is a
    // full copy, since a subregister use tied to a full register def doesn't
    // really make sense. e.g. don't fold:
    //
    // %vreg1 = COPY %vreg0:sub1
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
    //
    // into
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  bool FoldingImm = OpToFold.isImm();
  APInt Imm;

  if (FoldingImm) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI.getRegClass(UseReg) :
      TRI.getPhysRegClass(UseReg);

    Imm = APInt(64, OpToFold.getImm());

    const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
    const TargetRegisterClass *FoldRC =
      TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);

    // Split 64-bit constants into 32-bits for folding.
    if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
      if (UseRC->getSize() != 8)
        return;

      if (UseOp.getSubReg() == AMDGPU::sub0) {
        Imm = Imm.getLoBits(32);
      } else {
        assert(UseOp.getSubReg() == AMDGPU::sub1);
        Imm = Imm.getHiBits(32);
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.
    if (UseMI->getOpcode() == AMDGPU::COPY) {
      unsigned DestReg = UseMI->getOperand(0).getReg();
      const TargetRegisterClass *DestRC
        = TargetRegisterInfo::isVirtualRegister(DestReg) ?
        MRI.getRegClass(DestReg) :
        TRI.getPhysRegClass(DestReg);

      unsigned MovOp = TII->getMovOpcode(DestRC);
      if (MovOp == AMDGPU::COPY)
        return;

      UseMI->setDesc(TII->get(MovOp));
      CopiesToReplace.push_back(UseMI);
    }
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
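  // Instead, recurse on every user that reads the matching sub-register of
  // the REG_SEQUENCE result and attempt the fold there.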
  if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI.use_begin(RegSeqDstReg),
           RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace, TII, TRI, MRI);
    }
    return;
  }

  const MCInstrDesc &UseDesc = UseMI->getDesc();

  // Don't fold into target independent nodes. Target independent opcodes
  // don't have defined register classes.
  if (UseDesc.isVariadic() ||
      UseDesc.OpInfo[UseOpIdx].RegClass == -1)
    return;

  if (FoldingImm) {
    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

  // FIXME: We could try to change the instruction from 64-bit to 32-bit
  // to enable more folding opportunities. The shrink operands pass
  // already does this.
  return;
}

// Evaluate a 32-bit and/or/xor with two constant operands. Returns false for
// opcodes this helper does not know how to fold.
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  int32_t LHS, int32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
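// For example, "s_or_b32 x, 0" can be rewritten as a plain copy of x, and a
// bitwise op whose operands are both known constants can be replaced by a
// move of the folded value.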
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MachineOperand &Src0 = MI->getOperand(1);
    if (Src0.isImm()) {
      Src0.setImm(~Src0.getImm());
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
      return true;
    }

    return false;
  }

  if (!MI->isCommutable())
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    Src0->setImm(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    }
  }

  return false;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
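      // Operand 1 is the single source of the mov/copy selected above; the
      // checks below only let immediates, frame indexes, and virtual
      // registers through.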
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      //    ...
      //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      // We need to mutate the operands of new mov instructions to add implicit
      // uses of EXEC, but adding them invalidates the use_iterator, so defer
      // this.
      SmallVector<MachineInstr *, 4> CopiesToReplace;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
             Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();

        foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
                    CopiesToReplace, TII, TRI, MRI);
      }

      // Make sure we add EXEC uses to any new v_mov instructions created.
      for (MachineInstr *Copy : CopiesToReplace)
        Copy->addImplicitDefUseOperands(MF);

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (Fold.isReg()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            // FIXME: Probably shouldn't bother trying to fold if not an
            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
            // copies.
            MRI.clearKillFlags(Fold.OpToFold->getReg());
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');

          // Folding the immediate may reveal operations that can be constant
          // folded or replaced with a copy. This can happen for example after
          // frame indices are lowered to constants or from splitting 64-bit
          // constants.
          tryConstantFoldOp(MRI, TII, Fold.UseMI);
        }
      }
    }
  }
  return false;
}