//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "SIShrinkInstructions.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

public:
  SIShrinkInstructions() = default;
  bool run(MachineFunction &MF);
};

class SIShrinkInstructionsLegacy : public MachineFunctionPass {

public:
  static char ID;

  SIShrinkInstructionsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructionsLegacy, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructionsLegacy::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsLegacyPass() {
  return new SIShrinkInstructionsLegacy();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
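/// Only the src0 operand is considered; if that fold fails and \p TryToCommute
/// is set, the instruction is commuted once and the fold is retried.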
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;

      if (AMDGPU::VGPR_16RegClass.contains(Reg) &&
          !AMDGPU::VGPR_16_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns the opcode of an instruction a move immediate of the constant \p
/// Src can be replaced with if the constant is replaced with \p ModifiedImm.
/// i.e.
///
/// If the bitreverse of a constant is an inline immediate, reverse the
/// immediate and return the bitreverse opcode.
///
/// If the bitwise negation of a constant is an inline immediate, negate the
/// immediate and return the bitwise not opcode.
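///
/// \returns 0 if \p Src is already an inline constant or if neither
/// transformation yields one.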
static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
                                         const MachineOperand &Src,
                                         int32_t &ModifiedImm, bool Scalar) {
  if (TII->isInlineConstant(Src))
    return 0;
  int32_t SrcImm = static_cast<int32_t>(Src.getImm());

  if (!Scalar) {
    // We could handle the scalar case here, but we would need to check that
    // SCC is not live as S_NOT_B32 clobbers it. It's probably not worth it, as
    // the reasonable values are already covered by s_movk_i32.
    ModifiedImm = ~SrcImm;
    if (TII->isInlineConstant(APInt(32, ModifiedImm, true)))
      return AMDGPU::V_NOT_B32_e32;
  }

  ModifiedImm = reverseBits<int32_t>(SrcImm);
  if (TII->isInlineConstant(APInt(32, ModifiedImm, true)))
    return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;

  return 0;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction
  // to get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!SIInstrInfo::sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
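// This only applies when the address operands already occupy consecutive
// VGPRs, so they can be replaced by a single contiguous register tuple.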
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ?
      0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
    case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
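  // As with the AK form, the constant must be a literal (not an inline
  // constant) for the conversion to apply.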
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
    case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the
        // immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns the next valid instruction pointer if it was able to create a
// v_swap_b32.
//
// This should not be done too early, so as not to prevent possible folding
// that may remove the matched moves. It is preferably done before RA to
// release saved registers, and also possibly after RA, which can insert
// copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
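//
// Only the 16 instructions following MovT are searched for a matching mov
// pair.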
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0);

  // We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers
  // are not allocatable.
  if (Size == 2 && X.isVirtual())
    return nullptr;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY);

    MachineBasicBlock &MBB = *MovT.getParent();
    SmallVector<MachineInstr *, 4> Swaps;
    if (Size == 2) {
      auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                          TII->get(AMDGPU::V_SWAP_B16))
                      .addDef(X)
                      .addDef(Y)
                      .addReg(Y)
                      .addReg(X)
                      .getInstr();
      Swaps.push_back(MIB);
    } else {
      assert(Size > 0 && Size % 4 == 0);
      for (unsigned I = 0; I < Size / 4; ++I) {
        TargetInstrInfo::RegSubRegPair X1, Y1;
        X1 = getSubRegForIndex(X, Xsub, I);
        Y1 = getSubRegForIndex(Y, Ysub, I);
        auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                            TII->get(AMDGPU::V_SWAP_B32))
                        .addDef(X1.Reg, 0, X1.SubReg)
                        .addDef(Y1.Reg, 0, Y1.SubReg)
                        .addReg(Y1.Reg, 0, Y1.SubReg)
                        .addReg(X1.Reg, 0, X1.SubReg)
                        .getInstr();
        Swaps.push_back(MIB);
      }
    }
    // Drop implicit EXEC.
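    // The new swaps inherit MovX's implicit operands instead.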
    if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      for (MachineInstr *Swap : Swaps) {
        Swap->removeOperand(Swap->getNumExplicitOperands());
        Swap->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::run(MachineFunction &MF) {

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of
        // materializing sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ModImm;
          unsigned ModOpcode =
              canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
          if (ModOpcode != 0) {
            MI.setDesc(TII->get(ModOpcode));
            Src.setImm(static_cast<int64_t>(ModImm));
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
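      // These SOPK forms take a 16-bit signed immediate and require the
      // destination to also be the first source operand.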
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          unsigned ModOpc;
          int32_t ModImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
                                                        /*Scalar=*/true))) {
            MI.setDesc(TII->get(ModOpc));
            Src.setImm(static_cast<int64_t>(ModImm));
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32 bit form try to replace dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register
            // and cannot deal with sequences which would require multiple
            // copies of VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}

bool SIShrinkInstructionsLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIShrinkInstructions().run(MF);
}

PreservedAnalyses
SIShrinkInstructionsPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIShrinkInstructions().run(MF))
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}