//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false) :
    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isCommuted() const {
    return Commuted;
  }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const SISubtarget *ST;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   unsigned UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
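// For example, folding into src2 of v_mac_f32/f16 rewrites the instruction to
// v_mad_f32/f16, so the inline constant check must use the mad's operand
// description rather than the mac's.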
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_F16_e64: {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
      const MCInstrDesc &MadDesc
        = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
  }
  default:
    return false;
  }
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
      return true;
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call of commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
    return true;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef() && !TII->isSDWA(MI);
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  unsigned UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands; only fold a full
    // copy, since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %vreg1 = COPY %vreg0:sub1
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
    //
    // into
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  bool FoldingImm = OpToFold.isImm();

  // In order to fold immediates into copies, we need to change the
  // copy to a MOV.
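  // The mov opcode is chosen from the destination register class, and the
  // rewritten copy is recorded so its implicit exec use can be added once the
  // walk over the use list is finished.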
  if (FoldingImm && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI->getRegClass(DestReg) :
      TRI->getPhysRegClass(DestReg);

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImm) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32 bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI->getRegClass(UseReg) :
      TRI->getPhysRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister)
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
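// e.g. a bitwise op whose sources are both known immediates becomes a single
// move of the evaluated constant, and identities such as 'or x, 0' or
// 'and x, -1' become plain copies of x.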
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI,
                              MachineOperand *ImmOp) {
  unsigned Opc = MI->getOpcode();
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one.
static bool tryFoldInst(const SIInstrInfo *TII,
                        MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
      Opc == AMDGPU::V_CNDMASK_B32_e64 ||
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 =
      TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
    if (Src1->isIdenticalTo(*Src0)) {
      DEBUG(dbgs() << "Folded " << *MI << " into ");
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (Src2Idx != -1)
        MI->RemoveOperand(Src2Idx);
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
                                               : getMovOpc(false)));
      DEBUG(dbgs() << *MI << '\n');
      return true;
    }
  }

  return false;
}

void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    MachineRegisterInfo::use_iterator NextUse, NextInstUse;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; Use = NextUse) {
      NextUse = std::next(Use);
      MachineInstr *UseMI = Use->getParent();
      unsigned OpNo = Use.getOperandNo();

      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.

      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
        DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');

        // Some constant folding cases change the same immediate's use to a new
        // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
        // again. The same constant folded instruction could also have a second
        // use operand.
        NextUse = MRI->use_begin(Dst.getReg());
        continue;
      }

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
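      //
      // An operand that folds to an inline constant is always folded here; a
      // literal constant is only folded if it ends up with a single use.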
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
    }
  } else {
    // Folding register.
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; ++Use) {
      MachineInstr *UseMI = Use->getParent();

      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
                  FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    if (updateOperand(Fold, *TRI)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
            static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
      tryFoldInst(TII, Fold.UseMI);
    } else if (Fold.isCommuted()) {
      // Restore the instruction's original operand order if the fold failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
}

const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// We obviously have multiple uses in a clamp since the register is used twice
// in the same instruction.
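// Counting distinct non-debug user instructions, rather than use operands,
// keeps that double use from disqualifying the fold.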
static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
  int Count = 0;
  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
       I != E; ++I) {
    if (++Count > 1)
      return false;
  }

  return true;
}

bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
  if (!TII->hasFPClamp(*Def))
    return false;
  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::make_pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::make_pair(nullptr, SIOutMods::NONE);

    return std::make_pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::make_pair(Src0, SIOutMods::MUL2);

    return std::make_pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::make_pair(nullptr, SIOutMods::NONE);
  }
}

// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // TODO: Check nsz on instructions when fast math flags are preserved to MI
  // level.
  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      tryFoldInst(TII, &MI);

      if (!TII->isFoldableCopy(MI)) {
        if (IsIEEEMode || !tryFoldOMod(MI))
          tryFoldClamp(MI);
        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      //    ...
      //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      foldInstOperand(MI, OpToFold);
    }
  }
  return false;
}