//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

static bool isSafeToFold(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO: {
    // If there are additional implicit register operands, this may be used for
    // register indexing so the source register operand isn't simply copied.
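    // Note: MI.getNumOperands() counts every operand attached to the
    // instruction, including implicit ones, while the descriptor sum below is
    // the count a plain mov is expected to carry (explicit operands plus
    // declared implicit uses such as exec).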
    unsigned NumOps = MI.getDesc().getNumOperands() +
      MI.getDesc().getNumImplicitUses();

    return MI.getNumOperands() == NumOps;
  }
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
    return true;
  default:
    return false;
  }
}

static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call of commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
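    // Bail out before attempting the commute if either commutable operand is
    // not a register, since OpNo could otherwise end up naming an immediate.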
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
      return false;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef();
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
                        unsigned UseOpIdx,
                        std::vector<FoldCandidate> &FoldList,
                        SmallVectorImpl<MachineInstr *> &CopiesToReplace,
                        const SIInstrInfo *TII, const SIRegisterInfo &TRI,
                        MachineRegisterInfo &MRI) {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands unless it is a full
    // copy, since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %vreg1 = COPY %vreg0:sub1
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
    //
    // into
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end();
         RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace, TII, TRI, MRI);
    }

    return;
  }

  bool FoldingImm = OpToFold.isImm();

  // In order to fold immediates into copies, we need to change the
  // copy to a MOV.
  if (FoldingImm && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI.getRegClass(DestReg) :
      TRI.getPhysRegClass(DestReg);

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
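    // (Variadic instructions also lack fixed per-operand descriptors to
    // index, so they are skipped by the same check.)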
    if (UseDesc.isVariadic() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImm) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  APInt Imm(64, OpToFold.getImm());

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI.getRegClass(UseReg) :
      TRI.getPhysRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }
  }

  MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  int32_t LHS, int32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
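//
// Examples of the rewrites performed below:
//   s_not_b32  k       -> s_mov_b32 ~k
//   v_or_b32   x, 0    -> copy x
//   v_and_b32  x, 0    -> v_mov_b32 0
//   v_xor_b32  k0, k1  -> v_mov_b32 (k0 ^ k1)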
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MachineOperand &Src0 = MI->getOperand(1);
    if (Src0.isImm()) {
      Src0.setImm(~Src0.getImm());
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
      return true;
    }

    return false;
  }

  if (!MI->isCommutable())
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    Src0->setImm(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    }
  }

  return false;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
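      // (Operand 1 is the single source operand of the mov/copy opcodes
      // accepted by isSafeToFold above.)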
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      // ...
      // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      // We need to mutate the operands of new mov instructions to add implicit
      // uses of EXEC, but adding them invalidates the use_iterator, so defer
      // this.
      SmallVector<MachineInstr *, 4> CopiesToReplace;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
             Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();

        foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
                    CopiesToReplace, TII, TRI, MRI);
      }

      // Make sure we add EXEC uses to any new v_mov instructions created.
      for (MachineInstr *Copy : CopiesToReplace)
        Copy->addImplicitDefUseOperands(MF);

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (Fold.isReg()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            // FIXME: Probably shouldn't bother trying to fold if not an
            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
            // copies.
            MRI.clearKillFlags(Fold.OpToFold->getReg());
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');

          // Folding the immediate may reveal operations that can be constant
          // folded or replaced with a copy. This can happen for example after
          // frame indices are lowered to constants or from splitting 64-bit
          // constants.
          tryConstantFoldOp(MRI, TII, Fold.UseMI);
        }
      }
    }
  }
  return false;
}