//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA : public MachineFunctionPass {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;
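/// Describes a matched source-operand pattern: uses of the register defined
/// by the parent instruction (Replaced) can instead read a byte or word of
/// Target via src_sel, optionally applying abs/neg float modifiers or
/// integer sign extension.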
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:

  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg, return nullptr.
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg.
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}
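/// Combine the modifiers implied by this matched operand with any modifiers
/// already present on the instruction's source operand. ABS is ORed in, while
/// NEG is XORed so that folding a negation onto an already negated source
/// cancels out; SEXT is only legal when no float modifiers are present.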
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}
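/// Find the instruction this operand could be folded into. In the default
/// single-use mode this is the unique non-debug user of the replaced
/// register, or nullptr if there is none. When PotentialMatches is provided,
/// all users are instead checked for convertibility and recorded in the map,
/// and nullptr is returned unconditionally.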
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted.
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted.
      if (!isConvertibleToSDWA(UseMI, ST, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here.
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}
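/// Rewrite MI in place: locate the source operand matching the replaced
/// register (src0, src1 or, for UNUSED_PRESERVE, the input tied to vdst),
/// substitute the target register, and set src_sel and src_modifiers.
/// Returns false if the fold is not legal for this instruction.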
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  }

  // Find the operand in the instruction that matches the replaced operand and
  // substitute the target operand for it, setting the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0, then it could be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src || !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt or, if legal,
      // copy the target operand into the tied slot when the preserve
      // operation would effectively produce the same result by overwriting
      // the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src.
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2, which is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows only DWORD as dst_sel.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with the new
  // instruction's register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // To do this, clear all kill flags on uses of MI's source operands;
  // otherwise we could run into a use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register.
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use.
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI like any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}

std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate, it may be a copy of an immediate value, e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}
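/// Try to match MI against the recognized SDWA-foldable patterns: 32-bit
/// shifts by 16 or 24, 16-bit shifts by 8, BFE with a byte- or word-aligned
/// field, AND with a 0xff or 0xffff mask, and OR combining two SDWA results
/// (dst_unused:UNUSED_PRESERVE). Returns the matched operand, if any.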
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, BYTE_1, false, false,
          Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }
  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction.
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination bit patterns don't overlap. A compatible
    // instruction is either a regular instruction with compatible bitness or
    // an SDWA instruction with the correct dst_sel.
    // SDWAInst | OtherInst bitness     / OtherInst dst_sel
    // ----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular
    // instructions there is no way to determine if the instruction writes
    // only 8/16/24 bits out of the full register size, and all registers are
    // at least 32 bits wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also, OtherInst's dst_unused must be UNUSED_PAD.
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create the DstPreserveOperand.
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.

  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

namespace {
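/// Check whether MI can be rewritten as an SDWA instruction: it must either
/// already be SDWA or have an SDWA counterpart (possibly via its shrunken
/// VOP2 form), and the subtarget's restrictions on omod, VOPC sdst, and
/// mac/fmac opcodes must be satisfied.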
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo *TII) {
  // Check if this is already an SDWA instruction.
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA.
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode.
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of the implicit VCC use.
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to SDWA.
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

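  // Operands are appended in the exact order the SDWA MCInstrDesc expects:
  // dst, then each source's modifiers immediately followed by the source
  // itself, then clamp/omod and the sel/unused controls.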
  // Copy dst; if it is present in the original, it should also be present in
  // the SDWA form.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, which can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, and initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst.
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise.
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel)) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused)) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise.
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed.
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential
    // MIs, e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is
    // also a potential MI, do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// An instruction converted to SDWA should not have immediates or SGPR
// operands (except for one SGPR operand, which is allowed on GFX9). Copy its
// scalar operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
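  // Each block is processed to a fixpoint: a first matching pass rewrites
  // eligible V_{ADD|SUB}_CO_U32_e64 pairs into their VOP2 forms so they
  // become SDWA candidates, a second pass collects all convertible matches,
  // and the loop repeats while conversions keep succeeding.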
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}