//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false, int ShrinkOp = -1)
      : UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Kind(FoldOp->getType()), Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool needsShrink() const { return ShrinkOpcode != -1; }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const MachineOperand &OpToFold) const;

  bool updateOperand(FoldCandidate &Fold) const;

  bool canUseImmWithOpSel(FoldCandidate &Fold) const;

  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        MachineOperand *OpToFold) const;
  bool isUseSafeToFold(const MachineInstr &MI,
                       const MachineOperand &UseMO) const;
  bool
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg, uint8_t OpTy) const;
  bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
                      unsigned UseOpIdx,
                      SmallVectorImpl<FoldCandidate> &FoldList) const;
  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
  bool tryConstantFoldOp(MachineInstr *MI) const;
  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
  bool tryFoldFoldableCopy(MachineInstr &MI,
                           MachineOperand *&CurrentKnownM0Val) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldPhiAGPR(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
                                             const TargetRegisterInfo &TRI,
                                             const MachineOperand &MO) {
  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
  if (const TargetRegisterClass *SubRC =
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
    RC = SubRC;
  return RC;
}

// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}

// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
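// Illustrative sketch (hypothetical MIR, opcode spellings abridged): given
//   %0:vgpr_32 = V_MOV_B32_e32 %stack.0
//   BUFFER_STORE_DWORD_OFFEN %data, %0, $rsrc, 0, 0, ...
// the frame index may be folded straight into the vaddr operand, and the
// materializing mov can be erased once all of its uses are folded away.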
bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                       const MachineOperand &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  const uint64_t TSFlags = MI->getDesc().TSFlags;

  assert(Old.isReg() && Fold.isImm());

  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
      isUInt<16>(Fold.ImmToFold) ||
      !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
    return false;

  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
  switch (OpType) {
  default:
    return false;
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    break;
  }

  return true;
}

bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);

  // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
  // already set.
  int ModIdx = -1;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
    ModIdx = AMDGPU::OpName::src0_modifiers;
  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
    ModIdx = AMDGPU::OpName::src1_modifiers;
  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
    ModIdx = AMDGPU::OpName::src2_modifiers;
  assert(ModIdx != -1);
  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned Val = Mod.getImm();
  if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
    return false;

  // Only apply the following transformation if that operand requires
  // a packed immediate.
  // If upper part is all zero we do not need op_sel_hi.
  if (!(Fold.ImmToFold & 0xffff)) {
    MachineOperand New =
        MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
    Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
    Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
    Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
    return true;
  }
  MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
  if (!TII->isOperandLegal(*MI, OpNo, &New))
    return false;
  Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
  Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
  return true;
}

bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm() && canUseImmWithOpSel(Fold))
    return tryFoldImmWithOpSel(Fold);

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
              Dst1.getReg())
          .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    if (Fold.Commuted)
      TII->commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    if (Old.isTied()) {
      int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
      if (NewMFMAOpc == -1)
        return false;
      MI->setDesc(TII->get(NewMFMAOpc));
      MI->untieRegOperand(0);
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
}

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}

bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                                      MachineInstr *MI, unsigned OpNo,
                                      MachineOperand *OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold->isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    // We have to fold into operand which would be Imm not into OpNo.
    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      // Untie Src2 of fmac.
      MI->untieRegOperand(3);
      // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
      if (OpNo == 1) {
        MachineOperand &Op1 = MI->getOperand(1);
        MachineOperand &Op2 = MI->getOperand(2);
        Register OldReg = Op1.getReg();
        // Operand 2 might be an inlinable constant
        if (Op2.isImm()) {
          Op1.ChangeToImmediate(Op2.getImm());
          Op2.ChangeToRegister(OldReg, false);
        } else {
          Op1.setReg(Op2.getReg());
          Op2.setReg(OldReg);
        }
      }
      return true;
    }
    MI->setDesc(TII->get(Opc));
    return false;
  };

  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
  if (!IsLegal && OpToFold->isImm()) {
    FoldCandidate Fold(MI, OpNo, OpToFold);
    IsLegal = canUseImmWithOpSel(Fold);
  }

  if (!IsLegal) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
                      AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
      if (AddOpSel)
        MI->addOperand(MachineOperand::CreateImm(0));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      if (AddOpSel)
        MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_fmac_f32 if we are trying to fold into Src2.
    // By transforming into fmaak we can untie Src2 and make folding legal.
    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    // Special case for s_setreg_b32
    if (OpToFold->isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
    if (!CanCommute)
      return false;

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
      return false;

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
      return false;

    int Op32 = -1;
    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
          (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }

      // Verify the other operand is a VGPR, otherwise we would violate the
      // constant bus restriction.
      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||
          !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
        return false;

      assert(MI->getOperand(1).isDef());

      // Make sure to get the 32-bit version of the commuted opcode.
      unsigned MaybeCommutedOpc = MI->getOpcode();
      Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
    return true;
  }

  // Inlineable constant might have been folded into Imm operand of fmaak or
  // fmamk and we are trying to fold a non-inlinable constant.
  if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
      !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
    unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
    MachineOperand &OpImm = MI->getOperand(ImmIdx);
    if (!OpImm.isReg() &&
        TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
      return tryToFoldAsFMAAKorMK();
  }

  // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
  // By changing into fmamk we can untie Src2.
  // If folding for Src0 happens first and it is identical operand to Src1 we
  // should avoid transforming into fmamk which requires commuting as it would
  // cause folding into Src1 to fail later on due to wrong OpNo used.
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }

  // Check the case where we might introduce a second constant operand to a
  // scalar instruction
  if (TII->isSALU(MI->getOpcode())) {
    const MCInstrDesc &InstDesc = MI->getDesc();
    const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

    // Fine if the operand can be encoded as an inline constant
    if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
      // Otherwise check for another constant
      for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
        auto &Op = MI->getOperand(i);
        if (OpNo != i && !Op.isReg() &&
            !TII->isInlineConstant(Op, InstDesc.operands()[i]))
          return false;
      }
    }
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}

bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
                                     const MachineOperand &UseMO) const {
  // Operands of SDWA instructions must be registers.
  return !TII->isSDWA(MI);
}

// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking it to a foldable inline immediate if possible.
// Returns true on success.
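// Illustrative sketch (hypothetical MIR): for
//   %a:sgpr_32 = S_MOV_B32 1
//   %b:sgpr_32 = S_MOV_B32 1
//   %v:sreg_64 = REG_SEQUENCE %a, %subreg.sub0, %b, %subreg.sub1
// Defs receives one entry per subregister, pointing at the traced immediate
// operand (here the literal 1) or at the register operand when no foldable
// immediate can be found.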
bool SIFoldOperands::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy) const {
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    MachineOperand *Sub = &Def->getOperand(I);
    assert(Sub->isReg());

    for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
         SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
         !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
         SubDef = MRI->getVRegDef(Sub->getReg())) {
      MachineOperand *Op = &SubDef->getOperand(1);
      if (Op->isImm()) {
        if (TII->isInlineConstant(*Op, OpTy))
          Sub = Op;
        break;
      }
      if (!Op->isReg() || Op->getReg().isPhysical())
        break;
      Sub = Op;
    }

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
  }

  return true;
}

bool SIFoldOperands::tryToFoldACImm(
    const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
    return false;

  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  if (isUseMIInFoldList(FoldList, UseMI))
    return false;

  // Maybe it is just a COPY of an immediate itself.
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
      return true;
    }
  }

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, OpTy))
    return false;

  int32_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return false;

    auto SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      if (!TII->isInlineConstant(*Op, OpTy) ||
          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;

      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
  return true;
}

void SIFoldOperands::foldOperand(
    MachineOperand &OpToFold,
    MachineInstr *UseMI,
    int UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList,
    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg() &&
      (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
    return;

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (auto &RSUse :
         make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
      MachineInstr *RSUseMI = RSUse.getParent();

      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
                         RSUseMI->getOperandNo(&RSUse), FoldList))
        continue;

      if (RSUse.getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());

    const unsigned Opc = UseMI->getOpcode();
    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
      UseMI->setDesc(TII->get(NewOpc));
    }

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      if (DestRC == &AMDGPU::AGPR_32RegClass &&
          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        CopiesToReplace.push_back(UseMI);
        return;
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.
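    // (For example, a foldable immediate copied into a 32-bit VGPR becomes a
    // V_MOV_B32_e32 and one copied into a 32-bit SGPR becomes an S_MOV_B32;
    // this is a rough summary of what getMovOpcode selects below.)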

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);

      // Remove kill flags as kills may now be out of order with uses.
      MRI->clearKillFlags(OpToFold.getReg());

      // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32
      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
      // its initializers right here, so we will rematerialize immediates and
      // avoid copies via different reg classes.
      SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        const DebugLoc &DL = UseMI->getDebugLoc();
        MachineBasicBlock &MBB = *UseMI->getParent();

        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
          UseMI->removeOperand(I);

        MachineInstrBuilder B(*MBB.getParent(), UseMI);
        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
        for (unsigned I = 0; I < Size / 4; ++I) {
          MachineOperand *Def = Defs[I].first;
          TargetInstrInfo::RegSubRegPair CopyToVGPR;
          if (Def->isImm() &&
              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
            int64_t Imm = Def->getImm();

            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
            B.addReg(Tmp);
          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
            auto Src = getRegSubRegPair(*Def);
            Def->setIsKill(false);
            if (!SeenAGPRs.insert(Src)) {
              // We cannot build a reg_sequence out of the same registers, they
              // must be copied. Better do it here before copyPhysReg() creates
              // several reads to do the AGPR->VGPR->AGPR copy.
              CopyToVGPR = Src;
            } else {
              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
                       Src.SubReg);
            }
          } else {
            assert(Def->isReg());
            Def->setIsKill(false);
            auto Src = getRegSubRegPair(*Def);

            // Direct copy from SGPR to AGPR is not possible. To avoid creation
            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
            // create a copy here and track if we already have such a copy.
            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
              CopyToVGPR = Src;
            } else {
              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
              B.addReg(Tmp);
            }
          }

          if (CopyToVGPR.Reg) {
            Register Vgpr;
            if (VGPRCopies.count(CopyToVGPR)) {
              Vgpr = VGPRCopies[CopyToVGPR];
            } else {
              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
              VGPRCopies[CopyToVGPR] = Vgpr;
            }
            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
            B.addReg(Tmp);
          }

          B.addImm(Defs[I].second);
        }
        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
        return;
      }

      if (Size != 4)
        return;

      Register Reg0 = UseMI->getOperand(0).getReg();
      Register Reg1 = UseMI->getOperand(1).getReg();
      if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
      else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
      else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
               TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() || UseOp.isImplicit() ||
        UseDesc.operands()[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      // Don't fold if OpToFold doesn't hold an aligned register.
      const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        unsigned SubReg = OpToFold.getSubReg();
        if (const TargetRegisterClass *SubRC =
                TRI->getSubRegisterClass(RC, SubReg))
          RC = SubRC;
      }

      if (!RC || !TRI->isProperlyAlignedRC(*RC))
        return;
    }

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
      TRI->getRegClass(FoldDesc.operands()[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
    Register UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
    if (AMDGPU::getRegBitWidth(*UseRC) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction. e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}

MachineOperand *
SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
  // If this has a subregister, it obviously is a register source.
  if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
      !Op.getReg().isVirtual())
    return &Op;

  MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return &ImmSrc;
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;
  MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0->isImm()) {
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;
  MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
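    // (After getImmOrMaterializedImm, Src0/Src1 may point at an operand of a
    // defining move rather than at MI's own operand, so MI is updated by
    // index here instead of writing through those pointers.)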
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one
bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    auto *Src0Imm = getImmOrMaterializedImm(*Src0);
    auto *Src1Imm = getImmOrMaterializedImm(*Src1);
    if (!Src1Imm->isIdenticalTo(*Src0Imm))
      return false;
  }

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  mutateCopyOp(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}

bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0->isImm() || Src0->getImm() != 0xffff)
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  Register Dst = MI.getOperand(0).getReg();
  MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);
  bool Changed = false;

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (tryConstantFoldOp(&UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

  SmallVector<MachineOperand *, 4> UsesToProcess;
  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
    UsesToProcess.push_back(&Use);
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();
    foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                CopiesToReplace);
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);
    } else if (Fold.Commuted) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
  return true;
}

bool SIFoldOperands::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  // Specially track simple redefs of m0 to the same value in a block, so we
  // can erase the later ones.
  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();
      return true;
    }

    // We aren't tracking other physical registers
    CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
                            ? nullptr
                            : &NewM0Val;
    return false;
  }

  MachineOperand &OpToFold = MI.getOperand(1);
  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  // FIXME: We could also be folding things like TargetIndexes.
  if (!FoldingImm && !OpToFold.isReg())
    return false;

  if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
    return false;

  // Prevent folding operands backwards in the function. For example,
  // the COPY opcode must not be replaced by 1 in this example:
  //
  //    %3 = COPY %vgpr0; VGPR_32:%3
  //    ...
  //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
  if (!MI.getOperand(0).getReg().isVirtual())
    return false;

  bool Changed = foldInstOperand(MI, OpToFold);

  // If we managed to fold all uses of this copy then we might as well
  // delete it now.
  // The only reason we need to follow chains of copies here is that
  // tryFoldRegSequence looks forward through copies before folding a
  // REG_SEQUENCE into its eventual users.
  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
    InstToErase->eraseFromParent();
    Changed = true;
    InstToErase = nullptr;
    if (!SrcReg || SrcReg.isPhysical())
      break;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
      break;
  }

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
    Changed = true;
  }

  return Changed;
}

// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
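// Illustrative sketch (hypothetical MIR, operands abridged) of the pattern
// isClamp matches and tryFoldClamp rewrites:
//   %1:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %2:vgpr_32 = V_MAX_F32_e64 0, %1, 0, %1, 1 /*clamp*/, 0
// The clamp bit is folded into the defining add and the max is erased.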
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
                                                      : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
      return SIOutMods::DIV2;
    case 0x4000000000000000: // 2.0
      return SIOutMods::MUL2;
    case 0x4010000000000000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
          Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::pair(nullptr, SIOutMods::NONE);

    return std::pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
          Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::pair(Src0, SIOutMods::MUL2);

    return std::pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::pair(nullptr, SIOutMods::NONE);
  }
}

// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
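// Illustrative sketch (hypothetical MIR, operands abridged):
//   %2:vreg_64 = REG_SEQUENCE %0:agpr_32, %subreg.sub0, %1:agpr_32, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %2, ...
// On gfx90a+ the store's data operand class is a vector superclass, so the
// reg_sequence can be rebuilt with an AGPR destination and used directly.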
bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
    return false;

  for (auto &Def : Defs) {
    const auto *Op = Def.first;
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from AREG
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
      return false;
  }

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  while (UseMI->isCopy() && !Op->getSubReg()) {
    Reg = UseMI->getOperand(0).getReg();
    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
      return false;
    Op = &*MRI->use_nodbg_begin(Reg);
    UseMI = Op->getParent();
  }

  if (Op->getSubReg())
    return false;

  unsigned OpIdx = Op - &UseMI->getOperand(0);
  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const TargetRegisterClass *OpRC =
      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (unsigned I = 0; I < Defs.size(); ++I) {
    MachineOperand *Def = Defs[I].first;
    Def->setIsKill(false);
    if (TRI->isAGPR(*MRI, Def->getReg())) {
      RS.add(*Def);
    } else { // This is a copy
      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
      SubDef->getOperand(1).setIsKill(false);
      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
    }
    RS.addImm(Defs[I].second);
  }

  Op->setReg(Dst);
  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    Op->setReg(Reg);
    RS->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);

  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
  // in which case we can erase them all later in runOnMachineFunction.
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
  return true;
}

/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
/// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
static bool isAGPRCopy(const SIRegisterInfo &TRI,
                       const MachineRegisterInfo &MRI, const MachineInstr &Copy,
                       Register &OutReg, unsigned &OutSubReg) {
  assert(Copy.isCopy());

  const MachineOperand &CopySrc = Copy.getOperand(1);
  Register CopySrcReg = CopySrc.getReg();
  if (!CopySrcReg.isVirtual())
    return false;

  // Common case: copy from AGPR directly, e.g.
  //   %1:vgpr_32 = COPY %0:agpr_32
  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;
    OutSubReg = CopySrc.getSubReg();
    return true;
  }

  // Sometimes it can also involve two copies, e.g.
  //   %1:vgpr_256 = COPY %0:agpr_256
  //   %2:vgpr_32 = COPY %1:vgpr_256.sub0
  const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
  if (!CopySrcDef || !CopySrcDef->isCopy())
    return false;

  const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
  Register OtherCopySrcReg = OtherCopySrc.getReg();
  if (!OtherCopySrcReg.isVirtual() ||
      CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))
    return false;

  OutReg = OtherCopySrcReg;
  OutSubReg = CopySrc.getSubReg();
  return true;
}

// Try to hoist an AGPR to VGPR copy across a PHI.
// This should allow folding of an AGPR into a consumer which may support it.
//
// Example 1: LCSSA PHI
//   loop:
//     %1:vreg = COPY %0:areg
//   exit:
//     %2:vreg = PHI %1:vreg, %loop
// =>
//   loop:
//   exit:
//     %1:areg = PHI %0:areg, %loop
//     %2:vreg = COPY %1:areg
//
// Example 2: PHI with multiple incoming values:
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//   loop:
//     %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
//     %3:areg = COPY %2:vreg
//     %4:areg = (instr using %3:areg)
//     %5:vreg = COPY %4:areg
// =>
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//     %2:areg = COPY %1:vreg
//   loop:
//     %3:areg = PHI %2:areg, %entry, %X:areg, %loop
//     %4:areg = (instr using %3:areg)
bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
  assert(PHI.isPHI());

  Register PhiOut = PHI.getOperand(0).getReg();
  if (!TRI->isVGPR(*MRI, PhiOut))
    return false;

  // Iterate once over all incoming values of the PHI to check if this PHI is
  // eligible, and determine the exact AGPR RC we'll target.
  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
    if (!Copy || !Copy->isCopy())
      continue;

    Register AGPRSrc;
    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
    if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
      continue;

    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
      CopyInRC = SubRC;

    if (ARC && !ARC->hasSubClassEq(CopyInRC))
      return false;
    ARC = CopyInRC;
  }

  if (!ARC)
    return false;

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  // Rewrite the PHI's incoming values to ARC.
  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    Register Reg = MO.getReg();

    MachineBasicBlock::iterator InsertPt;
    MachineBasicBlock *InsertMBB = nullptr;

    // Look at the def of Reg, ignoring all copies.
    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {

      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
      // the copy was single-use, it will be removed by DCE later.
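      //
      // For instance, an incoming value defined as
      //   %1:vgpr_32 = COPY %0:agpr_32
      // simply gets its PHI operand rewritten to %0:agpr_32 (plus the copy's
      // subregister, if any); no new instruction is emitted for it.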
      if (Def->isCopy()) {
        Register AGPRSrc;
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
        if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
          MO.setReg(AGPRSrc);
          MO.setSubReg(AGPRSubReg);
          continue;
        }

        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
        // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
        // is unlikely to be profitable.
        //
        // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
        MachineOperand &CopyIn = Def->getOperand(1);
        if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
      }

      InsertMBB = Def->getParent();
      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
    } else {
      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
      InsertPt = InsertMBB->getFirstTerminator();
    }

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)
                           .addReg(Reg);
    MO.setReg(NewReg);

    (void)MI;
    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
  }

  // Replace the PHI's result with a new register.
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

  // COPY that new register back to the original PhiOut register. This COPY
  // will usually be folded out later.
  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
      .addReg(NewReg);

  LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
  return true;
}

// Attempt to convert a VGPR load into an AGPR load.
bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
  assert(MI.mayLoad());
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
    return false;

  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();

  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  SmallVector<const MachineInstr*, 8> Users;
  SmallVector<Register, 8> MoveRegs;
  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
    Users.push_back(&I);

  if (Users.empty())
    return false;

  // Check that all uses are copies to an agpr or reg_sequences producing an
  // agpr.
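  //
  // A sketch of the accepted shapes (illustrative only, registers made up):
  //   %v:vgpr_32 = <load>              ; MI
  //   %a:agpr_32 = COPY %v             ; ok: direct copy into an agpr
  // or
  //   %v:vgpr_32 = <load>
  //   %s:vreg_64 = REG_SEQUENCE %v, %subreg.sub0, %w:vgpr_32, %subreg.sub1
  //   %a:areg_64 = COPY %s             ; ok: reaches an agpr via a reg_sequence
  // Any other kind of user rejects the whole transformation.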
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    // Physical registers may have more than one defining instruction.
    if (DstReg.isPhysical())
      return false;
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  LLVM_DEBUG(dbgs() << "Folded " << MI);

  return true;
}

// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
// For GFX90A and later, this is pretty much always a good thing, but for
// GFX908 there are cases where it can create a lot more AGPR-AGPR copies,
// which are expensive on this architecture due to the lack of V_ACCVGPR_MOV.
//
// This function looks at all AGPR PHIs in a basic block and collects their
// operands. Then, it checks for registers that are used more than once across
// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
// having to create one VGPR temporary per use, which can get very messy if
// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per
// vector element).
//
// Example
//   a:
//     %in:agpr_256 = COPY %foo:vgpr_256
//   c:
//     %x:agpr_32 = ..
//   b:
//     %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
//     %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
//     %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
// =>
//   a:
//     %in:agpr_256 = COPY %foo:vgpr_256
//     %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
//     %tmp_agpr:agpr_32 = COPY %tmp
//   c:
//     %x:agpr_32 = ..
//   b:
//     %0:areg = PHI %tmp_agpr, %a, %x, %c
//     %1:areg = PHI %tmp_agpr, %a, %y, %c
//     %2:areg = PHI %tmp_agpr, %a, %z, %c
bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
  // This is only really needed on GFX908 where AGPR-AGPR copies are
  // unreasonably difficult.
  if (ST->hasGFX90AInsts())
    return false;

  // Look at all AGPR Phis and collect the register + subregister used.
  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
      RegToMO;

  for (auto &MI : MBB) {
    if (!MI.isPHI())
      break;

    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
      continue;

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);
      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
    }
  }

  // For all (Reg, SubReg) pairs that are used more than once, cache the value
  // in a VGPR.
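  //
  // (Rationale: on GFX908 an agpr-to-agpr COPY cannot be a single move and is
  // later expanded through a VGPR temporary, so emitting one V_ACCVGPR_READ
  // here is cheaper than one temporary per duplicated PHI operand.)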
  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)
      continue;

    const auto [Reg, SubReg] = Entry;
    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
    // out.
    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
    Register TempVGPR =
        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
            .addReg(Reg, /* flags */ 0, SubReg);

    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
    Register TempAGPR = MRI->createVirtualRegister(ARC);
    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
            TII->get(AMDGPU::COPY), TempAGPR)
        .addReg(TempVGPR);

    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
    for (MachineOperand *MO : MOs) {
      MO->setReg(TempAGPR);
      MO->setSubReg(AMDGPU::NoSubRegister);
      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
    }

    Changed = true;
  }

  return Changed;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;
  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineOperand *CurrentKnownM0Val = nullptr;
    for (auto &MI : make_early_inc_range(*MBB)) {
      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
        Changed = true;
        continue;
      }

      if (MI.mayLoad() && tryFoldLoad(MI)) {
        Changed = true;
        continue;
      }

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        continue;
      }

      // Saw an unknown clobber of m0, so we no longer know what it is.
      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

      // TODO: Omod might be OK if there is NSZ only on the source
      // instruction, and not the omod multiply.
      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
          !tryFoldOMod(MI))
        Changed |= tryFoldClamp(MI);
    }

    Changed |= tryOptimizeAGPRPhis(*MBB);
  }

  return Changed;
}