//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false) :
    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isCommuted() const {
    return Commuted;
  }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const SISubtarget *ST;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   unsigned UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
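// For example, an immediate folded into src2 of V_MAC_F32_e64 must be checked
// against V_MAD_F32, since the mac is rewritten to a mad when the fold is
// performed and the mad's operand types are what ultimately matter.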
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F32_e64: {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      unsigned Opc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
      const MCInstrDesc &MadDesc = TII->get(Opc);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
    return false;
  }
  default:
    return false;
  }
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
        return false;
      // If upper part is all zero we do not need op_sel_hi.
      if (!isUInt<16>(Fold.ImmToFold)) {
        if (!(Fold.ImmToFold & 0xffff)) {
          Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
          Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
          return true;
        }
        Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
      }
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);

    Old.setIsUndef(New->isUndef());
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
         Opc == AMDGPU::V_FMAC_F32_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
      unsigned NewOpc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
      return true;
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call of commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
    return true;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
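// Uses on SDWA instructions are also rejected; this pass does not try to fold
// into them (see the isSDWA check below).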
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef() && !TII->isSDWA(MI);
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  unsigned UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands; only fold a full
    // copy, since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %1 = COPY %0:sub1
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
    //
    // into
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  bool FoldingImm = OpToFold.isImm();

  // In order to fold immediates into copies, we need to change the
  // copy to a MOV.
  if (FoldingImm && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI->getRegClass(DestReg) :
      TRI->getPhysRegClass(DestReg);

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseOp.isImplicit() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImm) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI->getRegClass(UseReg) :
      TRI->getPhysRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister ||
        !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
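// For example, once both sources are known to be constants:
//
//  %2 = S_AND_B32 3, 5  -->  %2 = S_MOV_B32 1
//
// and identities such as "or x, 0" or "and x, -1" degrade to plain copies.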
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI,
                              MachineOperand *ImmOp) {
  unsigned Opc = MI->getOpcode();
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one.
static bool tryFoldInst(const SIInstrInfo *TII,
                        MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
      Opc == AMDGPU::V_CNDMASK_B32_e64 ||
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 =
      TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
    if (Src1->isIdenticalTo(*Src0)) {
      LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (Src2Idx != -1)
        MI->RemoveOperand(Src2Idx);
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
                                               : getMovOpc(false)));
      LLVM_DEBUG(dbgs() << *MI << '\n');
      return true;
    }
  }

  return false;
}

void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    MachineRegisterInfo::use_iterator NextUse;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; Use = NextUse) {
      NextUse = std::next(Use);
      MachineInstr *UseMI = Use->getParent();
      unsigned OpNo = Use.getOperandNo();

      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.

      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');

        // Some constant folding cases change the same immediate's use to a new
        // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
        // again. The same constant folded instruction could also have a second
        // use operand.
        NextUse = MRI->use_begin(Dst.getReg());
        FoldList.clear();
        continue;
      }

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
    }
  } else {
    // Folding register.
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; ++Use) {
      MachineInstr *UseMI = Use->getParent();

      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
                  FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    if (updateOperand(Fold, *TRI)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI << '\n');
      tryFoldInst(TII, Fold.UseMI);
    } else if (Fold.isCommuted()) {
      // Restore the instruction's original operand order if the fold failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
}

// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// We obviously have multiple uses in a clamp since the register is used twice
// in the same instruction.
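// Therefore count distinct using instructions rather than operand uses, so the
// two reads inside a single v_max count as one user.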
static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
  int Count = 0;
  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
       I != E; ++I) {
    if (++Count > 1)
      return false;
  }

  return true;
}

// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
                    << '\n');

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::make_pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::make_pair(nullptr, SIOutMods::NONE);

    return std::make_pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::make_pair(Src0, SIOutMods::MUL2);

    return std::make_pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::make_pair(nullptr, SIOutMods::NONE);
  }
}

// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // TODO: Check nsz on instructions when fast math flags are preserved to MI
  // level.
  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();

  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB->begin(); I != MBB->end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      tryFoldInst(TII, &MI);

      if (!TII->isFoldableCopy(MI)) {
        if (IsIEEEMode || !tryFoldOMod(MI))
          tryFoldClamp(MI);
        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //  %3 = COPY %vgpr0; VGPR_32:%3
      //  ...
      //  %vgpr0 = V_MOV_B32_e32 1, implicit %exec
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      foldInstOperand(MI, OpToFold);
    }
  }
  return false;
}