//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false,
                int ShrinkOp = -1) :
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool needsShrink() const { return ShrinkOpcode != -1; }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const MachineOperand &OpToFold) const;

  bool updateOperand(FoldCandidate &Fold) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        MachineOperand *OpToFold) const;
  bool isUseSafeToFold(const MachineInstr &MI,
                       const MachineOperand &UseMO) const;
  bool
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg, uint8_t OpTy) const;
  bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
                      unsigned UseOpIdx,
                      SmallVectorImpl<FoldCandidate> &FoldList) const;
  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
  bool tryConstantFoldOp(MachineInstr *MI) const;
  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
  bool tryFoldFoldableCopy(MachineInstr &MI,
                           MachineOperand *&CurrentKnownM0Val) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldPhiAGPR(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
                                             const TargetRegisterInfo &TRI,
                                             const MachineOperand &MO) {
  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
  if (const TargetRegisterClass *SubRC =
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
    RC = SubRC;
  return RC;
}

// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}

// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                       const MachineOperand &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  const uint64_t TSFlags = MI->getDesc().TSFlags;
  if (Fold.isImm()) {
    if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
        (!ST->hasDOTOpSelHazard() || !(TSFlags & SIInstrFlags::IsDOT)) &&
        AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
                                      ST->hasInv2PiInlineImm())) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
        // Only apply the following transformation if that operand requires
        // a packed immediate.
        switch (TII->get(Opcode).operands()[OpNo].OperandType) {
        case AMDGPU::OPERAND_REG_IMM_V2FP16:
        case AMDGPU::OPERAND_REG_IMM_V2INT16:
        case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
        case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
          // If upper part is all zero we do not need op_sel_hi.
          if (!isUInt<16>(Fold.ImmToFold)) {
            if (!(Fold.ImmToFold & 0xffff)) {
              Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
              Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
              Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
              return true;
            }
            Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
            Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
            return true;
          }
          break;
        default:
          break;
        }
      }
    }
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
              Dst1.getReg())
          .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    if (Fold.Commuted)
      TII->commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    if (Old.isTied()) {
      int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
      if (NewMFMAOpc == -1)
        return false;
      MI->setDesc(TII->get(NewMFMAOpc));
      MI->untieRegOperand(0);
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
}

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n  " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}

bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                                      MachineInstr *MI, unsigned OpNo,
                                      MachineOperand *OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold->isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    // We have to fold into operand which would be Imm not into OpNo.
    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      // Untie Src2 of fmac.
      MI->untieRegOperand(3);
      // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
      if (OpNo == 1) {
        MachineOperand &Op1 = MI->getOperand(1);
        MachineOperand &Op2 = MI->getOperand(2);
        Register OldReg = Op1.getReg();
        // Operand 2 might be an inlinable constant
        if (Op2.isImm()) {
          Op1.ChangeToImmediate(Op2.getImm());
          Op2.ChangeToRegister(OldReg, false);
        } else {
          Op1.setReg(Op2.getReg());
          Op2.setReg(OldReg);
        }
      }
      return true;
    }
    MI->setDesc(TII->get(Opc));
    return false;
  };

  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
                      AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
      if (AddOpSel)
        MI->addOperand(MachineOperand::CreateImm(0));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      if (AddOpSel)
        MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_fmac_f32 if we are trying to fold into Src2.
    // By transforming into fmaak we can untie Src2 and make folding legal.
    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    // Special case for s_setreg_b32
    if (OpToFold->isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
    if (!CanCommute)
      return false;

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call of commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
      return false;

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
      return false;

    int Op32 = -1;
    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
          (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }

      // Verify the other operand is a VGPR, otherwise we would violate the
      // constant bus restriction.
      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||
          !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
        return false;

      assert(MI->getOperand(1).isDef());

      // Make sure to get the 32-bit version of the commuted opcode.
      unsigned MaybeCommutedOpc = MI->getOpcode();
      Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
    return true;
  }

  // An inlineable constant might have been folded into the Imm operand of
  // fmaak or fmamk and we are trying to fold a non-inlinable constant.
  if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
      !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
    unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
    MachineOperand &OpImm = MI->getOperand(ImmIdx);
    if (!OpImm.isReg() &&
        TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
      return tryToFoldAsFMAAKorMK();
  }

  // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
  // By changing into fmamk we can untie Src2.
  // If folding for Src0 happens first and it is identical operand to Src1 we
  // should avoid transforming into fmamk which requires commuting as it would
  // cause folding into Src1 to fail later on due to wrong OpNo used.
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }

  // Check the case where we might introduce a second constant operand to a
  // scalar instruction.
  if (TII->isSALU(MI->getOpcode())) {
    const MCInstrDesc &InstDesc = MI->getDesc();
    const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

    // Fine if the operand can be encoded as an inline constant
    if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
      // Otherwise check for another constant
      for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
        auto &Op = MI->getOperand(i);
        if (OpNo != i && !Op.isReg() &&
            !TII->isInlineConstant(Op, InstDesc.operands()[i]))
          return false;
      }
    }
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}

bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
                                     const MachineOperand &UseMO) const {
  // Operands of SDWA instructions must be registers.
  return !TII->isSDWA(MI);
}

// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking each back to a foldable inline immediate if
// possible. Returns true on success.
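// On success, Defs holds one (initializer operand, subreg index) pair per
// source operand of the reg_sequence.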
bool SIFoldOperands::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy) const {
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    MachineOperand *Sub = &Def->getOperand(I);
    assert(Sub->isReg());

    for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
         SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
         !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
         SubDef = MRI->getVRegDef(Sub->getReg())) {
      MachineOperand *Op = &SubDef->getOperand(1);
      if (Op->isImm()) {
        if (TII->isInlineConstant(*Op, OpTy))
          Sub = Op;
        break;
      }
      if (!Op->isReg() || Op->getReg().isPhysical())
        break;
      Sub = Op;
    }

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
  }

  return true;
}

bool SIFoldOperands::tryToFoldACImm(
    const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
    return false;

  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  if (isUseMIInFoldList(FoldList, UseMI))
    return false;

  // Maybe it is just a COPY of an immediate itself.
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
      return true;
    }
  }

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, OpTy))
    return false;

  int32_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return false;

    auto SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      if (!TII->isInlineConstant(*Op, OpTy) ||
          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;

      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
  return true;
}

void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  int UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg() &&
      (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
    return;

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
      MachineInstr *RSUseMI = RSUse.getParent();

      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
                         RSUseMI->getOperandNo(&RSUse), FoldList))
        continue;

      if (RSUse.getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());

    const unsigned Opc = UseMI->getOpcode();
    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
      UseMI->setDesc(TII->get(NewOpc));
    }

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      if (DestRC == &AMDGPU::AGPR_32RegClass &&
          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        CopiesToReplace.push_back(UseMI);
        return;
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);

      // Remove kill flags as kills may now be out of order with uses.
      MRI->clearKillFlags(OpToFold.getReg());

      // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32
      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
      // its initializers right here, so we will rematerialize immediates and
      // avoid copies via different reg classes.
      SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        const DebugLoc &DL = UseMI->getDebugLoc();
        MachineBasicBlock &MBB = *UseMI->getParent();

        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
          UseMI->removeOperand(I);

        MachineInstrBuilder B(*MBB.getParent(), UseMI);
        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
        for (unsigned I = 0; I < Size / 4; ++I) {
          MachineOperand *Def = Defs[I].first;
          TargetInstrInfo::RegSubRegPair CopyToVGPR;
          if (Def->isImm() &&
              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
            int64_t Imm = Def->getImm();

            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
            B.addReg(Tmp);
          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
            auto Src = getRegSubRegPair(*Def);
            Def->setIsKill(false);
            if (!SeenAGPRs.insert(Src)) {
              // We cannot build a reg_sequence out of the same registers, they
              // must be copied. Better do it here before copyPhysReg() creates
              // several reads to do the AGPR->VGPR->AGPR copy.
              CopyToVGPR = Src;
            } else {
              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
                       Src.SubReg);
            }
          } else {
            assert(Def->isReg());
            Def->setIsKill(false);
            auto Src = getRegSubRegPair(*Def);

            // Direct copy from SGPR to AGPR is not possible. To avoid creation
            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
            // create a copy here and track if we already have such a copy.
            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
              CopyToVGPR = Src;
            } else {
              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
              B.addReg(Tmp);
            }
          }

          if (CopyToVGPR.Reg) {
            Register Vgpr;
            if (VGPRCopies.count(CopyToVGPR)) {
              Vgpr = VGPRCopies[CopyToVGPR];
            } else {
              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
              VGPRCopies[CopyToVGPR] = Vgpr;
            }
            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
            B.addReg(Tmp);
          }

          B.addImm(Defs[I].second);
        }
        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
        return;
      }

      if (Size != 4)
        return;

      Register Reg0 = UseMI->getOperand(0).getReg();
      Register Reg1 = UseMI->getOperand(1).getReg();
      if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
      else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
      else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
               TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() || UseOp.isImplicit() ||
        UseDesc.operands()[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      // Don't fold if OpToFold doesn't hold an aligned register.
      const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        unsigned SubReg = OpToFold.getSubReg();
        if (const TargetRegisterClass *SubRC =
                TRI->getSubRegisterClass(RC, SubReg))
          RC = SubRC;
      }

      if (!RC || !TRI->isProperlyAlignedRC(*RC))
        return;
    }

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
      TRI->getRegClass(FoldDesc.operands()[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
    Register UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
    if (AMDGPU::getRegBitWidth(*UseRC) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction. e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}

MachineOperand *
SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
  // If this has a subregister, it obviously is a register source.
  if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
      !Op.getReg().isVirtual())
    return &Op;

  MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return &ImmSrc;
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;
  MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0->isImm()) {
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;
  MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one.
bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    auto *Src0Imm = getImmOrMaterializedImm(*Src0);
    auto *Src1Imm = getImmOrMaterializedImm(*Src1);
    if (!Src1Imm->isIdenticalTo(*Src0Imm))
      return false;
  }

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  mutateCopyOp(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}

bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0->isImm() || Src0->getImm() != 0xffff)
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  Register Dst = MI.getOperand(0).getReg();
  MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);
  bool Changed = false;

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (tryConstantFoldOp(&UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

  SmallVector<MachineOperand *, 4> UsesToProcess;
  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
    UsesToProcess.push_back(&Use);
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();
    foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                CopiesToReplace);
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);
    } else if (Fold.Commuted) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
  return true;
}

bool SIFoldOperands::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  // Specially track simple redefs of m0 to the same value in a block, so we
  // can erase the later ones.
  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();
      return true;
    }

    // We aren't tracking other physical registers
    CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
                            ? nullptr
                            : &NewM0Val;
    return false;
  }

  MachineOperand &OpToFold = MI.getOperand(1);
  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  // FIXME: We could also be folding things like TargetIndexes.
  if (!FoldingImm && !OpToFold.isReg())
    return false;

  if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
    return false;

  // Prevent folding operands backwards in the function. For example,
  // the COPY opcode must not be replaced by 1 in this example:
  //
  //    %3 = COPY %vgpr0; VGPR_32:%3
  //    ...
  //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
  if (!MI.getOperand(0).getReg().isVirtual())
    return false;

  bool Changed = foldInstOperand(MI, OpToFold);

  // If we managed to fold all uses of this copy then we might as well
  // delete it now.
  // The only reason we need to follow chains of copies here is that
  // tryFoldRegSequence looks forward through copies before folding a
  // REG_SEQUENCE into its eventual users.
  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
    InstToErase->eraseFromParent();
    Changed = true;
    InstToErase = nullptr;
    if (!SrcReg || SrcReg.isPhysical())
      break;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
      break;
  }

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
    Changed = true;
  }

  return Changed;
}

// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
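// Returns the clamped source operand when MI is such a clamp, or nullptr
// otherwise.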
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
                                                      : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
      return SIOutMods::DIV2;
    case 0x4000000000000000: // 2.0
      return SIOutMods::MUL2;
    case 0x4010000000000000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
          Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::pair(nullptr, SIOutMods::NONE);

    return std::pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
          Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::pair(Src0, SIOutMods::MUL2);

    return std::pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::pair(nullptr, SIOutMods::NONE);
  }
}

// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
    return false;

  for (auto &Def : Defs) {
    const auto *Op = Def.first;
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from AREG
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
      return false;
  }

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  while (UseMI->isCopy() && !Op->getSubReg()) {
    Reg = UseMI->getOperand(0).getReg();
    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
      return false;
    Op = &*MRI->use_nodbg_begin(Reg);
    UseMI = Op->getParent();
  }

  if (Op->getSubReg())
    return false;

  unsigned OpIdx = Op - &UseMI->getOperand(0);
  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const TargetRegisterClass *OpRC =
      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (unsigned I = 0; I < Defs.size(); ++I) {
    MachineOperand *Def = Defs[I].first;
    Def->setIsKill(false);
    if (TRI->isAGPR(*MRI, Def->getReg())) {
      RS.add(*Def);
    } else { // This is a copy
      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
      SubDef->getOperand(1).setIsKill(false);
      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
    }
    RS.addImm(Defs[I].second);
  }

  Op->setReg(Dst);
  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    Op->setReg(Reg);
    RS->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);

  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
  // in which case we can erase them all later in runOnMachineFunction.
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
  return true;
}

/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
/// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
static bool isAGPRCopy(const SIRegisterInfo &TRI,
                       const MachineRegisterInfo &MRI, const MachineInstr &Copy,
                       Register &OutReg, unsigned &OutSubReg) {
  assert(Copy.isCopy());

  const MachineOperand &CopySrc = Copy.getOperand(1);
  Register CopySrcReg = CopySrc.getReg();
  if (!CopySrcReg.isVirtual())
    return false;

  // Common case: copy from AGPR directly, e.g.
  //   %1:vgpr_32 = COPY %0:agpr_32
  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;
    OutSubReg = CopySrc.getSubReg();
    return true;
  }

  // Sometimes it can also involve two copies, e.g.
  //   %1:vgpr_256 = COPY %0:agpr_256
  //   %2:vgpr_32 = COPY %1:vgpr_256.sub0
  const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
  if (!CopySrcDef || !CopySrcDef->isCopy())
    return false;

  const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
  Register OtherCopySrcReg = OtherCopySrc.getReg();
  if (!OtherCopySrcReg.isVirtual() ||
      CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))
    return false;

  OutReg = OtherCopySrcReg;
  OutSubReg = CopySrc.getSubReg();
  return true;
}

// Try to hoist an AGPR to VGPR copy across a PHI.
// This should allow folding of an AGPR into a consumer which may support it.
//
// Example 1: LCSSA PHI
//   loop:
//     %1:vreg = COPY %0:areg
//   exit:
//     %2:vreg = PHI %1:vreg, %loop
//  =>
//   loop:
//   exit:
//     %1:areg = PHI %0:areg, %loop
//     %2:vreg = COPY %1:areg
//
// Example 2: PHI with multiple incoming values:
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//   loop:
//     %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
//     %3:areg = COPY %2:vreg
//     %4:areg = (instr using %3:areg)
//     %5:vreg = COPY %4:areg
//  =>
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//     %2:areg = COPY %1:vreg
//   loop:
//     %3:areg = PHI %2:areg, %entry, %4:areg, %loop
//     %4:areg = (instr using %3:areg)
bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
  assert(PHI.isPHI());

  Register PhiOut = PHI.getOperand(0).getReg();
  if (!TRI->isVGPR(*MRI, PhiOut))
    return false;

  // Iterate once over all incoming values of the PHI to check if this PHI is
  // eligible, and determine the exact AGPR RC we'll target.
  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
    if (!Copy || !Copy->isCopy())
      continue;

    Register AGPRSrc;
    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
    if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
      continue;

    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
      CopyInRC = SubRC;

    if (ARC && !ARC->hasSubClassEq(CopyInRC))
      return false;
    ARC = CopyInRC;
  }

  if (!ARC)
    return false;

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  // Rewrite the PHI's incoming values to ARC.
  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    Register Reg = MO.getReg();

    MachineBasicBlock::iterator InsertPt;
    MachineBasicBlock *InsertMBB = nullptr;

    // Look at the def of Reg, ignoring all copies.
    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {

      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
      // the copy was single-use, it will be removed by DCE later.
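      //
      // Illustrative sketch (hypothetical registers): if the incoming value
      // comes from
      //   %v:vgpr_32 = COPY %a:agpr_32
      // the PHI operand is simply rewritten to read %a (plus its subreg)
      // directly, and no new copy is emitted for it below.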
      if (Def->isCopy()) {
        Register AGPRSrc;
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
        if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
          MO.setReg(AGPRSrc);
          MO.setSubReg(AGPRSubReg);
          continue;
        }

        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
        // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
        // is unlikely to be profitable.
        //
        // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
        MachineOperand &CopyIn = Def->getOperand(1);
        if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
      }

      InsertMBB = Def->getParent();
      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
    } else {
      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
      InsertPt = InsertMBB->getFirstTerminator();
    }

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)
                           .addReg(Reg);
    MO.setReg(NewReg);

    (void)MI;
    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
  }

  // Replace the PHI's result with a new register.
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

  // COPY that new register back to the original PhiOut register. This COPY
  // will usually be folded out later.
  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
      .addReg(NewReg);

  LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
  return true;
}

// Attempt to convert a VGPR load to an AGPR load.
bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
  assert(MI.mayLoad());
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
    return false;

  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();

  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  SmallVector<const MachineInstr*, 8> Users;
  SmallVector<Register, 8> MoveRegs;
  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
    Users.push_back(&I);

  if (Users.empty())
    return false;

  // Check that all uses are copies to an agpr or a reg_sequence producing an
  // agpr.
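  //
  // Illustrative sketch (hypothetical registers) of the intended rewrite on
  // GFX90A:
  //   %0:vgpr_32 = GLOBAL_LOAD_DWORD %vaddr, ...
  //   %1:agpr_32 = COPY %0
  // After switching %0 (and any intermediate copy/reg_sequence results) to the
  // equivalent AGPR class, the load defines an AGPR directly:
  //   %0:agpr_32 = GLOBAL_LOAD_DWORD %vaddr, ...
  //   %1:agpr_32 = COPY %0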
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    // Physical registers may have more than one defining instruction.
    if (DstReg.isPhysical())
      return false;
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  LLVM_DEBUG(dbgs() << "Folded " << MI);

  return true;
}

// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
// For GFX90A and later, this is pretty much always a good thing, but for GFX908
// there are cases where it can create a lot more AGPR-AGPR copies, which are
// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
//
// This function looks at all AGPR PHIs in a basic block and collects their
// operands. Then, it checks for registers that are used more than once across
// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
// having to create one VGPR temporary per use, which can get very messy if
// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
// element).
//
// Example
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
//    %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
//    %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
// =>
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//    %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
//    %tmp_agpr:agpr_32 = COPY %tmp
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %tmp_agpr, %a, %x, %c
//    %1:areg = PHI %tmp_agpr, %a, %y, %c
//    %2:areg = PHI %tmp_agpr, %a, %z, %c
bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
  // This is only really needed on GFX908 where AGPR-AGPR copies are
  // unreasonably difficult.
  if (ST->hasGFX90AInsts())
    return false;

  // Look at all AGPR Phis and collect the register + subregister used.
  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
      RegToMO;

  for (auto &MI : MBB) {
    if (!MI.isPHI())
      break;

    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
      continue;

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);
      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
    }
  }

  // For all (Reg, SubReg) pairs that are used more than once, cache the value
  // in a VGPR.
  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)
      continue;

    const auto [Reg, SubReg] = Entry;
    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
    // out.
    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
    Register TempVGPR =
        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
            .addReg(Reg, /* flags */ 0, SubReg);

    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
    Register TempAGPR = MRI->createVirtualRegister(ARC);
    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
            TII->get(AMDGPU::COPY), TempAGPR)
        .addReg(TempVGPR);

    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
    for (MachineOperand *MO : MOs) {
      MO->setReg(TempAGPR);
      MO->setSubReg(AMDGPU::NoSubRegister);
      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
    }

    Changed = true;
  }

  return Changed;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;
  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineOperand *CurrentKnownM0Val = nullptr;
    for (auto &MI : make_early_inc_range(*MBB)) {
      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
        Changed = true;
        continue;
      }

      if (MI.mayLoad() && tryFoldLoad(MI)) {
        Changed = true;
        continue;
      }

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        continue;
      }

      // Saw an unknown clobber of m0, so we no longer know what it is.
      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

      // TODO: Omod might be OK if there is NSZ only on the source
      // instruction, and not the omod multiply.
      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
          !tryFoldOMod(MI))
        Changed |= tryFoldClamp(MI);
    }

    Changed |= tryOptimizeAGPRPhis(*MBB);
  }

  return Changed;
}