//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov, the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules:
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
//
// Otherwise cancel.
//
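// For example (an illustrative sketch in the notation above, with made-up
// virtual registers): since all-ones is the identity of V_AND_B32 and
// bound_ctrl is off, the second rule rewrites
//
//   %old = V_MOV_B32_e32 -1
//   %dpp = V_MOV_B32_dpp %old, %other_lane, dpp_controls..., 0xf, 0xf, 0
//   %res = V_AND_B32_e32 %dpp, %src1
//
// into
//
//   %res = V_AND_B32_dpp %src1, %other_lane, %src1,
//                        dpp_controls..., 0xf, 0xf, 0
//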
// The mov_dpp instruction should reside in the same BB as all its uses.
//===----------------------------------------------------------------------===//

#include "GCNDPPCombine.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");

namespace {

class GCNDPPCombine {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const GCNSubtarget *ST;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd, bool CombBCZ,
                              bool IsShrinkable) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR, bool CombBCZ,
                              bool IsShrinkable) const;

  bool hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, int64_t Value,
                       int64_t Mask = -1) const;

  bool combineDPPMov(MachineInstr &MI) const;

  int getDPPOp(unsigned Op, bool IsShrinkable) const;
  bool isShrinkable(MachineInstr &MI) const;

public:
  bool run(MachineFunction &MF);
};

class GCNDPPCombineLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNDPPCombineLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};

} // end anonymous namespace

INITIALIZE_PASS(GCNDPPCombineLegacy, DEBUG_TYPE, "GCN DPP Combine", false,
                false)

char GCNDPPCombineLegacy::ID = 0;

char &llvm::GCNDPPCombineLegacyID = GCNDPPCombineLegacy::ID;

FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombineLegacy();
}

bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  if (!TII->isVOP3(Op)) {
    return false;
  }
  if (!TII->hasVALU32BitEncoding(Op)) {
    LLVM_DEBUG(dbgs() << "  Inst hasn't e32 equivalent\n");
    return false;
  }
  // Do not shrink True16 instructions pre-RA to avoid the restriction in
  // register allocation from only being able to use 128 VGPRs
  if (AMDGPU::isTrue16Inst(Op))
    return false;
  if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    // Give up if there are any uses of the sdst in carry-out or VOPC.
    // The shrunken form of the instruction would write it to vcc instead of to
    // a virtual register. If we rewrote the uses the shrinking would be
    // possible.
    if (!MRI->use_nodbg_empty(SDst->getReg()))
      return false;
  }
  // Check whether modifiers other than abs|neg are set (opsel, for example).
  const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
  if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
    LLVM_DEBUG(dbgs() << "  Inst has non-default modifiers\n");
    return false;
  }
  return true;
}

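// Returns the DPP pseudo opcode to use for Op: prefer the 32-bit DPP form
// (going through the e32 equivalent when the VOP3 is shrinkable), fall back to
// the 64-bit (VOP3) DPP form on subtargets that support it, or return -1 if no
// usable DPP encoding exists.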
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
  int DPP32 = AMDGPU::getDPPOp32(Op);
  if (IsShrinkable) {
    assert(DPP32 == -1);
    int E32 = AMDGPU::getVOPe32(Op);
    DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
  }
  if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
    return DPP32;
  int DPP64 = -1;
  if (ST->hasVOP3DPP())
    DPP64 = AMDGPU::getDPPOp64(Op);
  if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
    return DPP64;
  return -1;
}

// Tracks the register operand definition and returns:
//   1. immediate operand used to initialize the register if found
//   2. nullptr if the register operand is undef
//   3. the operand itself otherwise
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch (Def->getOpcode()) {
  default: break;
  case AMDGPU::IMPLICIT_DEF:
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64: {
    auto &Op1 = Def->getOperand(1);
    if (Op1.isImm())
      return &Op1;
    break;
  }
  }
  return &OldOpnd;
}

[[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
                                                MachineRegisterInfo &MRI) {
  int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
  if (RegClass == -1)
    return 0;

  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
  return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
}

MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ,
                                           bool IsShrinkable) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  bool HasVOP3DPP = ST->hasVOP3DPP();
  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }
  int OrigOpE32 = AMDGPU::getVOPe32(OrigOp);
  // The prior checks in combineDPPMov happen to cover the mask requirement for
  // VOPC, though not by design; assert it here.
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
  (void)MaskAllLanes;
  assert((MaskAllLanes ||
          !(TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                   TII->isVOPC(OrigOpE32)))) &&
         "VOPC cannot form DPP unless mask is full");

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, OrigMI.getDebugLoc(),
                         TII->get(DPPOp))
                     .setMIFlags(OrigMI.getFlags());

  bool Fail = false;
  do {
    int NumOperands = 0;
    if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
      DPPInst.add(*Dst);
      ++NumOperands;
    }
    if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
      if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
        DPPInst.add(*SDst);
        ++NumOperands;
      }
      // If we shrunk a 64bit vop3b to 32bits, just ignore the sdst
    }

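    // Forward the combined old value if the DPP opcode takes an 'old' operand;
    // VOPC forms have none (they write to SGPRs), and anything else (e.g.
    // MAC/FMA) is rejected for now.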
    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(
          CombOldVGPR,
          *MRI->getRegClass(
              TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
          *MRI));
      auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
      DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
                     CombOldVGPR.SubReg);
      ++NumOperands;
    } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                      TII->isVOPC(OrigOpE32))) {
      // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand
      // because they write to SGPRs not VGPRs
    } else {
      // TODO: this discards MAC/FMA instructions for now, let's add it later
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
    if (Mod0) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src0_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    int Src0Idx = NumOperands;
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
    if (Mod1) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Src1) {
      int OpNum = NumOperands;
      // If subtarget does not support SGPRs for src1 operand then the
      // requirements are the same as for src0. We check src0 instead because
      // pseudos are shared between subtargets and allow SGPR for src1 on all.
      if (!ST->hasDPPSrc1SGPR()) {
        assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
                   getOperandSize(*DPPInst, NumOperands, *MRI) &&
               "Src0 and Src1 operands should have the same size");
        OpNum = Src0Idx;
      }
      if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
    if (Mod2) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod2->getImm());
      ++NumOperands;
    }
    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    if (Src2) {
      if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
          !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
      ++NumOperands;
    }

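    // VOP3 / VOP3P DPP forms can carry extra operands (clamp, vdst_in, omod,
    // op_sel, op_sel_hi, neg_lo, neg_hi, byte_sel). Forward each one from the
    // original instruction when the DPP opcode accepts it; op_sel must be all
    // zeros and op_sel_hi all ones, otherwise the combine is abandoned.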
    if (HasVOP3DPP) {
      auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
      if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) {
        DPPInst.addImm(ClampOpr->getImm());
      }
      auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
      if (VdstInOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::vdst_in)) {
        DPPInst.add(*VdstInOpr);
      }
      auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
      if (OmodOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::omod)) {
        DPPInst.addImm(OmodOpr->getImm());
      }
      // Validate that OP_SEL is all zeros and OP_SEL_HI is all ones.
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
        int64_t OpSel = 0;
        OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0);
        OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0);
        OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0);
        if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI))
          OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3;

        if (OpSel != 0) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel must be zero\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel))
          DPPInst.addImm(OpSel);
      }
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
        int64_t OpSelHi = 0;
        OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0);
        OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0);
        OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0);

        // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
        // the bitmask for 3 op_sel_hi bits set.
        assert(Src2 && "Expected vop3p with 3 operands");
        if (OpSelHi != 7) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel_hi must be all set to one\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel_hi))
          DPPInst.addImm(OpSelHi);
      }
      auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
      if (NegOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_lo)) {
        DPPInst.addImm(NegOpr->getImm());
      }
      auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
      if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
        DPPInst.addImm(NegHiOpr->getImm());
      }
      auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
      if (ByteSelOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
        DPPInst.addImm(ByteSelOpr->getImm());
      }
    }
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined: " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}

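// Returns true if the immediate in OldOpnd is the identity element of the
// binary operation OrigMIOp, i.e. combining it (as the first source) with any
// value X yields X: 0 for add/or/xor/subrev/umax, all ones for and/umin,
// INT_MAX for smin, INT_MIN for smax, and 1 for the 24-bit multiplies.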
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  assert(OldOpnd->isImm());
  switch (OrigMIOp) {
  default: break;
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_U32_e64:
  case AMDGPU::V_SUBREV_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e64:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_MAX_U32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::V_XOR_B32_e64:
    if (OldOpnd->getImm() == 0)
      return true;
    break;
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_MIN_U32_e32:
  case AMDGPU::V_MIN_U32_e64:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
      return true;
    break;
  case AMDGPU::V_MIN_I32_e32:
  case AMDGPU::V_MIN_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
      return true;
    break;
  case AMDGPU::V_MAX_I32_e32:
  case AMDGPU::V_MAX_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
      return true;
    break;
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_I32_I24_e64:
  case AMDGPU::V_MUL_U32_U24_e32:
  case AMDGPU::V_MUL_U32_U24_e64:
    if (OldOpnd->getImm() == 1)
      return true;
    break;
  }
  return false;
}

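// Wrapper around the operand-building createDPPInst above. When bound_ctrl is
// not being forced to zero and the old value is a known immediate, that
// immediate must be the identity of OrigMI's opcode, and src1 (which must be a
// register of the mov destination's class) takes over as the combined old
// operand.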
MachineInstr *GCNDPPCombine::createDPPInst(
    MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
    MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const {
  assert(CombOldVGPR.Reg);
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
      return nullptr;
    }
    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
      LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
      return nullptr;
    }
    CombOldVGPR = getRegSubRegPair(*Src1);
    auto *MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
    const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
    if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
      LLVM_DEBUG(dbgs() << "  failed: src1 has wrong register class\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
}

// Returns true if MI has no immediate operand named OpndName, or if that
// operand, masked with Mask, equals Value.
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
                                    int64_t Value, int64_t Mask) const {
  auto *Imm = TII->getNamedOperand(MI, OpndName);
  if (!Imm)
    return true;

  assert(Imm->isImm());
  return (Imm->getImm() & Mask) == Value;
}

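// Tries to fold MovMI (a 32- or 64-bit DPP mov) into all of its VALU uses
// according to the rules in the file header. On success the original
// instructions, including the mov, are erased and true is returned; on any
// failure the DPP instructions created so far are erased instead and false is
// returned.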
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (DPPMovReg.isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move writes physreg\n");
    return false;
  }
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
      MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
    auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
    assert(DppCtrl && DppCtrl->isImm());
    if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
      LLVM_DEBUG(dbgs() << "  failed: 64 bit dpp move uses unsupported"
                           " control value\n");
      // Let it split; then the control may become legal.
      return false;
    }
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  assert(OldOpnd && OldOpnd->isReg());
  assert(SrcOpnd && SrcOpnd->isReg());
  if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move reads physreg\n");
    return false;
  }

  auto *const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF), an immediate, or something
  // else. We could use assert(!OldOpndValue || OldOpndValue->isImm()), but the
  // third option is used to distinguish undef from non-immediate so the
  // IMPLICIT_DEF instruction can be reused later.
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

  bool CombBCZ = false;

  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() <<
        "  failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << *OldOpndValue;
    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');

  SmallVector<MachineInstr *, 4> OrigMIs, DPPMIs;
  DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RegSeqWithOpNos;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // Try to reuse the previous old reg if it's undefined (IMPLICIT_DEF).
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
    CombOldVGPR = RegSubRegPair(MRI->createVirtualRegister(RC));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  SmallVector<MachineOperand *, 16> Uses;

  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
    Uses.push_back(&Use);
  }

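  // Walk every use of the DPP mov (looking through REG_SEQUENCE forwarders)
  // and try to rewrite each user into its DPP form; a single failure stops the
  // walk and forces a rollback.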
  while (!Uses.empty()) {
    MachineOperand *Use = Uses.pop_back_val();
    Rollback = true;

    auto &OrigMI = *Use->getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    assert((TII->get(OrigOp).getSize() != 4 || !AMDGPU::isTrue16Inst(OrigOp)) &&
           "There should not be e32 True16 instructions pre-RA");
    if (OrigOp == AMDGPU::REG_SEQUENCE) {
      Register FwdReg = OrigMI.getOperand(0).getReg();
      unsigned FwdSubReg = 0;

      if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
        LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                             " for all uses\n");
        break;
      }

      unsigned OpNo, E = OrigMI.getNumOperands();
      for (OpNo = 1; OpNo < E; OpNo += 2) {
        if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
          FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
          break;
        }
      }

      if (!FwdSubReg)
        break;

      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
        if (Op.getSubReg() == FwdSubReg)
          Uses.push_back(&Op);
      }
      RegSeqWithOpNos[&OrigMI].push_back(OpNo);
      continue;
    }

    bool IsShrinkable = isShrinkable(OrigMI);
    if (!(IsShrinkable ||
          ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
            TII->isVOP3(OrigOp)) &&
           ST->hasVOP3DPP()) ||
          TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3/3P/C\n");
      break;
    }
    if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
      LLVM_DEBUG(dbgs() << "  failed: can't combine v_cmpx\n");
      break;
    }

    auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
      break;
    }

    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    assert(Src0 && "Src1 without Src0?");
    if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
                         (Src2 && Src2->isIdenticalTo(*Src0)))) ||
        (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
                         (Src2 && Src2->isIdenticalTo(*Src1))))) {
      LLVM_DEBUG(
          dbgs()
          << "  " << OrigMI
          << "  failed: DPP register is used more than once per instruction\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (Use == Src0) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ, IsShrinkable)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else {
      assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted: " << *NewMI);
        if (auto *DPPInst =
                createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
                              IsShrinkable)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    }
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  Rollback |= !Uses.empty();

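  // On success erase the now-dead original instructions (including the DPP
  // mov); on rollback erase the DPP instructions created above instead.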
  for (auto *MI : *(Rollback ? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  if (!Rollback) {
    for (auto &S : RegSeqWithOpNos) {
      if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
        S.first->eraseFromParent();
        continue;
      }
      while (!S.second.empty())
        S.first->getOperand(S.second.pop_back_val()).setIsUndef();
    }
  }

  return !Rollback;
}

bool GCNDPPCombineLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return GCNDPPCombine().run(MF);
}

bool GCNDPPCombine::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasDPP())
    return false;

  MRI = &MF.getRegInfo();
  TII = ST->getInstrInfo();

  bool Changed = false;
  for (auto &MBB : MF) {
    for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
                 MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
        if (ST->hasDPALU_DPP() && combineDPPMov(MI)) {
          Changed = true;
          ++NumDPPMovsCombined;
        } else {
          auto Split = TII->expandMovDPP64(MI);
          for (auto *M : {Split.first, Split.second}) {
            if (M && combineDPPMov(*M))
              ++NumDPPMovsCombined;
          }
          Changed = true;
        }
      }
    }
  }
  return Changed;
}

PreservedAnalyses GCNDPPCombinePass::run(MachineFunction &MF,
                                         MachineFunctionAnalysisManager &) {
  MFPropsModifier _(*this, MF);

  if (MF.getFunction().hasOptNone())
    return PreservedAnalyses::all();

  bool Changed = GCNDPPCombine().run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}