//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "SIPeepholeSDWA.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  bool run(MachineFunction &MF);
};

class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
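
// Base class for matched SDWA patterns. Each subclass records the operand
// that will appear in the converted SDWA instruction (Target) and the
// operand it substitutes for (Replaced).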
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)

char SIPeepholeSDWALegacy::ID = 0;

char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif
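
// Copy the register, subregister, and relevant flags from From to To;
// kill flags are copied for uses and dead flags for defs.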
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}
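
// If PotentialMatches is given, register this operand with every instruction
// that uses the replaced register, but only if all of those uses can be
// converted to SDWA; in that mode nullptr is returned. Otherwise return the
// single instruction that uses the replaced register, if there is exactly one.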
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  }

  // Find the operand in the instruction that matches the source operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt, or, if legal,
      // copy the target operand into the tied slot when the preserve
      // operation will effectively cause the same result by overwriting the
      // rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter a problem with a use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add implicit use of preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}
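
// Try to interpret Op as an immediate: either a literal immediate operand or
// a register that is a foldable copy of an immediate (e.g. %1 = S_MOV_B32 255).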
std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    //      0 |     8 | BYTE_0
    //      0 |    16 | WORD_0
    //      0 |    32 | DWORD ?
    //      8 |     8 | BYTE_1
    //     16 |     8 | BYTE_2
    //     16 |    16 | WORD_1
    //     24 |     8 | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible
    // bitness or an SDWA instruction with a correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.
  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

namespace {
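// Check whether MI has an SDWA counterpart on this subtarget (directly or
// via its shrunken VOP2 form) and satisfies the operand restrictions of the
// SDWA encoding.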
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo* TII) {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc)) {
    // FIXME: Reenable after fixing selection handling.
    // Cf. llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
    return false;
  }

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace
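
// Rewrite MI into its SDWA form and try to apply all matched operand
// patterns to it. On success the new SDWA instruction replaces MI; otherwise
// the tentative SDWA instruction is erased and MI is left unchanged.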
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it should also be
  // present in the SDWA form.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel)) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused)) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its
    // SDWA form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is
    // also a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
    for (MachineOperand &MO : SDWAInst->uses()) {
      if (!MO.isReg())
        continue;

      MRI->clearKillFlags(MO.getReg());
    }
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIPeepholeSDWA().run(MF);
}
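
// Driver: in each basic block, repeat matching and conversion until no more
// instructions are converted, since one conversion can expose new patterns.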
bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}

PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}