1 //===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file This pass tries to apply several peephole SDWA patterns. 11 /// 12 /// E.g. original: 13 /// V_LSHRREV_B32_e32 %vreg0, 16, %vreg1 14 /// V_ADD_I32_e32 %vreg2, %vreg0, %vreg3 15 /// V_LSHLREV_B32_e32 %vreg4, 16, %vreg2 16 /// 17 /// Replace: 18 /// V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3 19 /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 20 /// 21 //===----------------------------------------------------------------------===// 22 23 24 #include "AMDGPU.h" 25 #include "AMDGPUSubtarget.h" 26 #include "SIDefines.h" 27 #include "SIInstrInfo.h" 28 #include "llvm/ADT/Statistic.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/CodeGen/MachineFunctionPass.h" 31 #include "llvm/CodeGen/MachineInstrBuilder.h" 32 #include <unordered_map> 33 34 using namespace llvm; 35 36 #define DEBUG_TYPE "si-peephole-sdwa" 37 38 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found."); 39 STATISTIC(NumSDWAInstructionsPeepholed, 40 "Number of instruction converted to SDWA."); 41 42 namespace { 43 44 class SDWAOperand; 45 46 class SIPeepholeSDWA : public MachineFunctionPass { 47 private: 48 MachineRegisterInfo *MRI; 49 const SIRegisterInfo *TRI; 50 const SIInstrInfo *TII; 51 52 std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; 53 54 Optional<int64_t> foldToImm(const MachineOperand &Op) const; 55 56 public: 57 static char ID; 58 59 typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector; 60 61 SIPeepholeSDWA() : MachineFunctionPass(ID) { 62 initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); 63 } 64 65 bool runOnMachineFunction(MachineFunction &MF) override; 66 void matchSDWAOperands(MachineBasicBlock &MBB); 67 bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); 68 69 StringRef getPassName() const override { return "SI Peephole SDWA"; } 70 71 void getAnalysisUsage(AnalysisUsage &AU) const override { 72 AU.setPreservesCFG(); 73 MachineFunctionPass::getAnalysisUsage(AU); 74 } 75 }; 76 77 class SDWAOperand { 78 private: 79 MachineOperand *Target; // Operand that would be used in converted instruction 80 MachineOperand *Replaced; // Operand that would be replace by Target 81 82 public: 83 SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) 84 : Target(TargetOp), Replaced(ReplacedOp) { 85 assert(Target->isReg()); 86 assert(Replaced->isReg()); 87 } 88 89 virtual ~SDWAOperand() {} 90 91 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; 92 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; 93 94 MachineOperand *getTargetOperand() const { return Target; } 95 MachineOperand *getReplacedOperand() const { return Replaced; } 96 MachineInstr *getParentInst() const { return Target->getParent(); } 97 MachineRegisterInfo *getMRI() const { 98 return &getParentInst()->getParent()->getParent()->getRegInfo(); 99 } 100 }; 101 102 using namespace AMDGPU::SDWA; 103 104 class SDWASrcOperand : public SDWAOperand { 105 private: 106 SdwaSel SrcSel; 107 bool Abs; 108 bool Neg; 109 bool Sext; 110 111 public: 112 SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 113 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, 114 bool Sext_ = false) 115 : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), 116 Neg(Neg_), Sext(Sext_) {} 117 118 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 119 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 120 121 SdwaSel getSrcSel() const { return SrcSel; } 122 bool getAbs() const { return Abs; } 123 bool getNeg() const { return Neg; } 124 bool getSext() const { return Sext; } 125 126 uint64_t getSrcMods() const; 127 }; 128 129 class SDWADstOperand : public SDWAOperand { 130 private: 131 SdwaSel DstSel; 132 DstUnused DstUn; 133 134 public: 135 SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 136 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) 137 : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} 138 139 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 140 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 141 142 SdwaSel getDstSel() const { return DstSel; } 143 DstUnused getDstUnused() const { return DstUn; } 144 }; 145 146 } // End anonymous namespace. 147 148 INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false) 149 150 char SIPeepholeSDWA::ID = 0; 151 152 char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; 153 154 FunctionPass *llvm::createSIPeepholeSDWAPass() { 155 return new SIPeepholeSDWA(); 156 } 157 158 #ifndef NDEBUG 159 160 static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) { 161 switch(Sel) { 162 case BYTE_0: OS << "BYTE_0"; break; 163 case BYTE_1: OS << "BYTE_1"; break; 164 case BYTE_2: OS << "BYTE_2"; break; 165 case BYTE_3: OS << "BYTE_3"; break; 166 case WORD_0: OS << "WORD_0"; break; 167 case WORD_1: OS << "WORD_1"; break; 168 case DWORD: OS << "DWORD"; break; 169 } 170 return OS; 171 } 172 173 static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { 174 switch(Un) { 175 case UNUSED_PAD: OS << "UNUSED_PAD"; break; 176 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; 177 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; 178 } 179 return OS; 180 } 181 182 static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) { 183 OS << "SDWA src: " << *Src.getTargetOperand() 184 << " src_sel:" << Src.getSrcSel() 185 << " abs:" << Src.getAbs() << " neg:" << Src.getNeg() 186 << " sext:" << Src.getSext() << '\n'; 187 return OS; 188 } 189 190 static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) { 191 OS << "SDWA dst: " << *Dst.getTargetOperand() 192 << " dst_sel:" << Dst.getDstSel() 193 << " dst_unused:" << Dst.getDstUnused() << '\n'; 194 return OS; 195 } 196 197 #endif 198 199 static bool isSameBB(const MachineInstr *FirstMI, const MachineInstr *SecondMI) { 200 assert(FirstMI && SecondMI); 201 return FirstMI->getParent() == SecondMI->getParent(); 202 } 203 204 static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { 205 assert(To.isReg() && From.isReg()); 206 To.setReg(From.getReg()); 207 To.setSubReg(From.getSubReg()); 208 To.setIsUndef(From.isUndef()); 209 if (To.isUse()) { 210 To.setIsKill(From.isKill()); 211 } else { 212 To.setIsDead(From.isDead()); 213 } 214 } 215 216 static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { 217 return LHS.isReg() && 218 RHS.isReg() && 219 LHS.getReg() == RHS.getReg() && 220 LHS.getSubReg() == RHS.getSubReg(); 221 } 222 223 static bool isSubregOf(const MachineOperand &SubReg, 224 const MachineOperand &SuperReg, 225 const TargetRegisterInfo *TRI) { 226 227 if (!SuperReg.isReg() || !SubReg.isReg()) 228 return false; 229 230 if (isSameReg(SuperReg, SubReg)) 231 return true; 232 233 if (SuperReg.getReg() != SubReg.getReg()) 234 return false; 235 236 LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg()); 237 LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg()); 238 SuperMask |= ~SubMask; 239 return SuperMask.all(); 240 } 241 242 uint64_t SDWASrcOperand::getSrcMods() const { 243 uint64_t Mods = 0; 244 if (Abs || Neg) { 245 assert(!Sext && 246 "Float and integer src modifiers can't be set simulteniously"); 247 Mods |= Abs ? SISrcMods::ABS : 0; 248 Mods |= Neg ? SISrcMods::NEG : 0; 249 } else if (Sext) { 250 Mods |= SISrcMods::SEXT; 251 } 252 253 return Mods; 254 } 255 256 MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { 257 // For SDWA src operand potential instruction is one that use register 258 // defined by parent instruction 259 MachineRegisterInfo *MRI = getMRI(); 260 MachineOperand *Replaced = getReplacedOperand(); 261 assert(Replaced->isReg()); 262 263 MachineInstr *PotentialMI = nullptr; 264 for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) { 265 // If this is use of another subreg of dst reg then do nothing 266 if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo())) 267 continue; 268 269 // If there exist use of dst in another basic block or use of superreg of 270 // dst then we should not combine this opernad 271 if (!isSameBB(PotentialMO.getParent(), getParentInst()) || 272 !isSameReg(PotentialMO, *Replaced)) 273 return nullptr; 274 275 // Check that PotentialMI is only instruction that uses dst reg 276 if (PotentialMI == nullptr) { 277 PotentialMI = PotentialMO.getParent(); 278 } else if (PotentialMI != PotentialMO.getParent()) { 279 return nullptr; 280 } 281 } 282 283 return PotentialMI; 284 } 285 286 bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 287 // Find operand in instruction that matches source operand and replace it with 288 // target operand. Set corresponding src_sel 289 290 MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 291 MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 292 MachineOperand *SrcMods = 293 TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 294 assert(Src && Src->isReg()); 295 if (!isSameReg(*Src, *getReplacedOperand())) { 296 // If this is not src0 then it should be src1 297 Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 298 SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 299 SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 300 301 assert(Src && Src->isReg()); 302 303 if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 304 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 305 !isSameReg(*Src, *getReplacedOperand())) { 306 // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to 307 // src2. This is not allowed. 308 return false; 309 } 310 311 assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods); 312 } 313 copyRegOperand(*Src, *getTargetOperand()); 314 SrcSel->setImm(getSrcSel()); 315 SrcMods->setImm(getSrcMods()); 316 getTargetOperand()->setIsKill(false); 317 return true; 318 } 319 320 MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { 321 // For SDWA dst operand potential instruction is one that defines register 322 // that this operand uses 323 MachineRegisterInfo *MRI = getMRI(); 324 MachineInstr *ParentMI = getParentInst(); 325 MachineOperand *Replaced = getReplacedOperand(); 326 assert(Replaced->isReg()); 327 328 for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) { 329 if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo())) 330 continue; 331 332 if (!isSameBB(getParentInst(), PotentialMO.getParent()) || 333 !isSameReg(*Replaced, PotentialMO)) 334 return nullptr; 335 336 // Check that ParentMI is the only instruction that uses replaced register 337 for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) { 338 if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) && 339 UseMO.getParent() != ParentMI) { 340 return nullptr; 341 } 342 } 343 344 // Due to SSA this should be onle def of replaced register, so return it 345 return PotentialMO.getParent(); 346 } 347 348 return nullptr; 349 } 350 351 bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 352 // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused 353 354 if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 355 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 356 getDstSel() != AMDGPU::SDWA::DWORD) { 357 // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD 358 return false; 359 } 360 361 MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 362 assert(Operand && 363 Operand->isReg() && 364 isSameReg(*Operand, *getReplacedOperand())); 365 copyRegOperand(*Operand, *getTargetOperand()); 366 MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 367 assert(DstSel); 368 DstSel->setImm(getDstSel()); 369 MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 370 assert(DstUnused); 371 DstUnused->setImm(getDstUnused()); 372 373 // Remove original instruction because it would conflict with our new 374 // instruction by register definition 375 getParentInst()->eraseFromParent(); 376 return true; 377 } 378 379 Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { 380 if (Op.isImm()) { 381 return Op.getImm(); 382 } 383 384 // If this is not immediate then it can be copy of immediate value, e.g.: 385 // %vreg1<def> = S_MOV_B32 255; 386 if (Op.isReg()) { 387 for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { 388 if (!isSameReg(Op, Def)) 389 continue; 390 391 const MachineInstr *DefInst = Def.getParent(); 392 if (!TII->isFoldableCopy(*DefInst) || !isSameBB(Op.getParent(), DefInst)) 393 return None; 394 395 const MachineOperand &Copied = DefInst->getOperand(1); 396 if (!Copied.isImm()) 397 return None; 398 399 return Copied.getImm(); 400 } 401 } 402 403 return None; 404 } 405 406 void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { 407 for (MachineInstr &MI : MBB) { 408 unsigned Opcode = MI.getOpcode(); 409 switch (Opcode) { 410 case AMDGPU::V_LSHRREV_B32_e32: 411 case AMDGPU::V_ASHRREV_I32_e32: 412 case AMDGPU::V_LSHLREV_B32_e32: { 413 // from: v_lshrrev_b32_e32 v1, 16/24, v0 414 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 415 416 // from: v_ashrrev_i32_e32 v1, 16/24, v0 417 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 418 419 // from: v_lshlrev_b32_e32 v1, 16/24, v0 420 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD 421 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 422 auto Imm = foldToImm(*Src0); 423 if (!Imm) 424 break; 425 426 if (*Imm != 16 && *Imm != 24) 427 break; 428 429 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 430 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 431 if (TRI->isPhysicalRegister(Src1->getReg()) || 432 TRI->isPhysicalRegister(Dst->getReg())) 433 break; 434 435 if (Opcode == AMDGPU::V_LSHLREV_B32_e32) { 436 auto SDWADst = make_unique<SDWADstOperand>( 437 Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); 438 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n'); 439 SDWAOperands[&MI] = std::move(SDWADst); 440 ++NumSDWAPatternsFound; 441 } else { 442 auto SDWASrc = make_unique<SDWASrcOperand>( 443 Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, 444 Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true); 445 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); 446 SDWAOperands[&MI] = std::move(SDWASrc); 447 ++NumSDWAPatternsFound; 448 } 449 break; 450 } 451 452 case AMDGPU::V_LSHRREV_B16_e32: 453 case AMDGPU::V_ASHRREV_I16_e32: 454 case AMDGPU::V_LSHLREV_B16_e32: { 455 // from: v_lshrrev_b16_e32 v1, 8, v0 456 // to SDWA src:v0 src_sel:BYTE_1 457 458 // from: v_ashrrev_i16_e32 v1, 8, v0 459 // to SDWA src:v0 src_sel:BYTE_1 sext:1 460 461 // from: v_lshlrev_b16_e32 v1, 8, v0 462 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 463 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 464 auto Imm = foldToImm(*Src0); 465 if (!Imm || *Imm != 8) 466 break; 467 468 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 469 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 470 471 if (TRI->isPhysicalRegister(Src1->getReg()) || 472 TRI->isPhysicalRegister(Dst->getReg())) 473 break; 474 475 if (Opcode == AMDGPU::V_LSHLREV_B16_e32) { 476 auto SDWADst = 477 make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); 478 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n'); 479 SDWAOperands[&MI] = std::move(SDWADst); 480 ++NumSDWAPatternsFound; 481 } else { 482 auto SDWASrc = make_unique<SDWASrcOperand>( 483 Src1, Dst, BYTE_1, false, false, 484 Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true); 485 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); 486 SDWAOperands[&MI] = std::move(SDWASrc); 487 ++NumSDWAPatternsFound; 488 } 489 break; 490 } 491 492 case AMDGPU::V_BFE_I32: 493 case AMDGPU::V_BFE_U32: { 494 // e.g.: 495 // from: v_bfe_u32 v1, v0, 8, 8 496 // to SDWA src:v0 src_sel:BYTE_1 497 498 // offset | width | src_sel 499 // ------------------------ 500 // 0 | 8 | BYTE_0 501 // 0 | 16 | WORD_0 502 // 0 | 32 | DWORD ? 503 // 8 | 8 | BYTE_1 504 // 16 | 8 | BYTE_2 505 // 16 | 16 | WORD_1 506 // 24 | 8 | BYTE_3 507 508 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 509 auto Offset = foldToImm(*Src1); 510 if (!Offset) 511 break; 512 513 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 514 auto Width = foldToImm(*Src2); 515 if (!Width) 516 break; 517 518 SdwaSel SrcSel = DWORD; 519 520 if (*Offset == 0 && *Width == 8) 521 SrcSel = BYTE_0; 522 else if (*Offset == 0 && *Width == 16) 523 SrcSel = WORD_0; 524 else if (*Offset == 0 && *Width == 32) 525 SrcSel = DWORD; 526 else if (*Offset == 8 && *Width == 8) 527 SrcSel = BYTE_1; 528 else if (*Offset == 16 && *Width == 8) 529 SrcSel = BYTE_2; 530 else if (*Offset == 16 && *Width == 16) 531 SrcSel = WORD_1; 532 else if (*Offset == 24 && *Width == 8) 533 SrcSel = BYTE_3; 534 else 535 break; 536 537 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 538 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 539 540 if (TRI->isPhysicalRegister(Src0->getReg()) || 541 TRI->isPhysicalRegister(Dst->getReg())) 542 break; 543 544 auto SDWASrc = make_unique<SDWASrcOperand>( 545 Src0, Dst, SrcSel, false, false, 546 Opcode == AMDGPU::V_BFE_U32 ? false : true); 547 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); 548 SDWAOperands[&MI] = std::move(SDWASrc); 549 ++NumSDWAPatternsFound; 550 break; 551 } 552 case AMDGPU::V_AND_B32_e32: { 553 // e.g.: 554 // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 555 // to SDWA src:v0 src_sel:WORD_0/BYTE_0 556 557 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 558 auto Imm = foldToImm(*Src0); 559 if (!Imm) 560 break; 561 562 if (*Imm != 0x0000ffff && *Imm != 0x000000ff) 563 break; 564 565 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 566 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 567 568 if (TRI->isPhysicalRegister(Src1->getReg()) || 569 TRI->isPhysicalRegister(Dst->getReg())) 570 break; 571 572 auto SDWASrc = make_unique<SDWASrcOperand>( 573 Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); 574 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); 575 SDWAOperands[&MI] = std::move(SDWASrc); 576 ++NumSDWAPatternsFound; 577 break; 578 } 579 } 580 } 581 } 582 583 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, 584 const SDWAOperandsVector &SDWAOperands) { 585 // Check if this instruction can be converted to SDWA: 586 // 1. Does this opcode support SDWA 587 if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1) 588 return false; 589 590 // 2. Are all operands - VGPRs 591 for (const MachineOperand &Operand : MI.explicit_operands()) { 592 if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg())) 593 return false; 594 } 595 596 // Convert to sdwa 597 int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode()); 598 assert(SDWAOpcode != -1); 599 600 const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); 601 602 // Create SDWA version of instruction MI and initialize its operands 603 MachineInstrBuilder SDWAInst = 604 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); 605 606 // Copy dst, if it is present in original then should also be present in SDWA 607 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 608 if (Dst) { 609 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); 610 SDWAInst.add(*Dst); 611 } else { 612 assert(TII->isVOPC(MI)); 613 } 614 615 // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and 616 // src0_modifiers (except for v_nop_sdwa, but it can't get here) 617 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 618 assert( 619 Src0 && 620 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 && 621 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1); 622 SDWAInst.addImm(0); 623 SDWAInst.add(*Src0); 624 625 // Copy src1 if present, initialize src1_modifiers. 626 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 627 if (Src1) { 628 assert( 629 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 && 630 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1); 631 SDWAInst.addImm(0); 632 SDWAInst.add(*Src1); 633 } else { 634 assert(TII->isVOP1(MI)); 635 } 636 637 if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || 638 SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { 639 // v_mac_f16/32 has additional src2 operand tied to vdst 640 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 641 assert(Src2); 642 SDWAInst.add(*Src2); 643 } 644 645 // Initialize clamp. 646 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); 647 SDWAInst.addImm(0); 648 649 // Initialize dst_sel and dst_unused if present 650 if (Dst) { 651 assert( 652 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 && 653 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1); 654 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 655 SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); 656 } 657 658 // Initialize src0_sel 659 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1); 660 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 661 662 663 // Initialize src1_sel if present 664 if (Src1) { 665 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1); 666 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 667 } 668 669 // Apply all sdwa operand pattenrs 670 bool Converted = false; 671 for (auto &Operand : SDWAOperands) { 672 Converted |= Operand->convertToSDWA(*SDWAInst, TII); 673 } 674 if (!Converted) { 675 SDWAInst->eraseFromParent(); 676 return false; 677 } 678 679 DEBUG(dbgs() << "Convert instruction:" << MI 680 << "Into:" << *SDWAInst << '\n'); 681 ++NumSDWAInstructionsPeepholed; 682 683 MI.eraseFromParent(); 684 return true; 685 } 686 687 bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { 688 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 689 690 if (!ST.hasSDWA() || 691 !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9 692 return false; 693 } 694 695 MRI = &MF.getRegInfo(); 696 TRI = ST.getRegisterInfo(); 697 TII = ST.getInstrInfo(); 698 699 std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches; 700 701 // FIXME: For now we only combine instructions in one basic block 702 for (MachineBasicBlock &MBB : MF) { 703 SDWAOperands.clear(); 704 matchSDWAOperands(MBB); 705 706 PotentialMatches.clear(); 707 for (auto &OperandPair : SDWAOperands) { 708 auto &Operand = OperandPair.second; 709 MachineInstr *PotentialMI = Operand->potentialToConvert(TII); 710 if (PotentialMI) { 711 PotentialMatches[PotentialMI].push_back(std::move(Operand)); 712 } 713 } 714 715 for (auto &PotentialPair : PotentialMatches) { 716 MachineInstr &PotentialMI = *PotentialPair.first; 717 convertToSDWA(PotentialMI, PotentialPair.second); 718 } 719 } 720 return false; 721 } 722