1 //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file This pass tries to apply several peephole SDWA patterns. 11 /// 12 /// E.g. original: 13 /// V_LSHRREV_B32_e32 %0, 16, %1 14 /// V_ADD_I32_e32 %2, %0, %3 15 /// V_LSHLREV_B32_e32 %4, 16, %2 16 /// 17 /// Replace: 18 /// V_ADD_I32_sdwa %4, %1, %3 19 /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 20 /// 21 //===----------------------------------------------------------------------===// 22 23 #include "AMDGPU.h" 24 #include "AMDGPUSubtarget.h" 25 #include "SIDefines.h" 26 #include "SIInstrInfo.h" 27 #include "SIRegisterInfo.h" 28 #include "Utils/AMDGPUBaseInfo.h" 29 #include "llvm/ADT/None.h" 30 #include "llvm/ADT/Optional.h" 31 #include "llvm/ADT/STLExtras.h" 32 #include "llvm/ADT/SmallVector.h" 33 #include "llvm/ADT/Statistic.h" 34 #include "llvm/CodeGen/MachineBasicBlock.h" 35 #include "llvm/CodeGen/MachineFunction.h" 36 #include "llvm/CodeGen/MachineFunctionPass.h" 37 #include "llvm/CodeGen/MachineInstr.h" 38 #include "llvm/CodeGen/MachineInstrBuilder.h" 39 #include "llvm/CodeGen/MachineOperand.h" 40 #include "llvm/CodeGen/MachineRegisterInfo.h" 41 #include "llvm/CodeGen/TargetRegisterInfo.h" 42 #include "llvm/MC/LaneBitmask.h" 43 #include "llvm/MC/MCInstrDesc.h" 44 #include "llvm/Pass.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/raw_ostream.h" 47 #include <algorithm> 48 #include <cassert> 49 #include <cstdint> 50 #include <memory> 51 #include <unordered_map> 52 53 using namespace llvm; 54 55 #define DEBUG_TYPE "si-peephole-sdwa" 56 57 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found."); 58 STATISTIC(NumSDWAInstructionsPeepholed, 59 "Number of instruction converted to SDWA."); 60 61 namespace { 62 63 class SDWAOperand; 64 class SDWADstOperand; 65 66 class SIPeepholeSDWA : public MachineFunctionPass { 67 public: 68 using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; 69 70 private: 71 MachineRegisterInfo *MRI; 72 const SIRegisterInfo *TRI; 73 const SIInstrInfo *TII; 74 75 std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; 76 std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches; 77 SmallVector<MachineInstr *, 8> ConvertedInstructions; 78 79 Optional<int64_t> foldToImm(const MachineOperand &Op) const; 80 81 public: 82 static char ID; 83 84 SIPeepholeSDWA() : MachineFunctionPass(ID) { 85 initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); 86 } 87 88 bool runOnMachineFunction(MachineFunction &MF) override; 89 void matchSDWAOperands(MachineBasicBlock &MBB); 90 std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); 91 bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; 92 bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); 93 void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; 94 95 StringRef getPassName() const override { return "SI Peephole SDWA"; } 96 97 void getAnalysisUsage(AnalysisUsage &AU) const override { 98 AU.setPreservesCFG(); 99 MachineFunctionPass::getAnalysisUsage(AU); 100 } 101 }; 102 103 class SDWAOperand { 104 private: 105 MachineOperand *Target; // Operand that would be used in converted instruction 106 MachineOperand *Replaced; // Operand that would be replace by Target 107 108 public: 109 SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) 110 : Target(TargetOp), Replaced(ReplacedOp) { 111 assert(Target->isReg()); 112 assert(Replaced->isReg()); 113 } 114 115 virtual ~SDWAOperand() = default; 116 117 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; 118 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; 119 120 MachineOperand *getTargetOperand() const { return Target; } 121 MachineOperand *getReplacedOperand() const { return Replaced; } 122 MachineInstr *getParentInst() const { return Target->getParent(); } 123 124 MachineRegisterInfo *getMRI() const { 125 return &getParentInst()->getParent()->getParent()->getRegInfo(); 126 } 127 128 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 129 virtual void print(raw_ostream& OS) const = 0; 130 void dump() const { print(dbgs()); } 131 #endif 132 }; 133 134 using namespace AMDGPU::SDWA; 135 136 class SDWASrcOperand : public SDWAOperand { 137 private: 138 SdwaSel SrcSel; 139 bool Abs; 140 bool Neg; 141 bool Sext; 142 143 public: 144 SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 145 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, 146 bool Sext_ = false) 147 : SDWAOperand(TargetOp, ReplacedOp), 148 SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} 149 150 MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 151 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 152 153 SdwaSel getSrcSel() const { return SrcSel; } 154 bool getAbs() const { return Abs; } 155 bool getNeg() const { return Neg; } 156 bool getSext() const { return Sext; } 157 158 uint64_t getSrcMods(const SIInstrInfo *TII, 159 const MachineOperand *SrcOp) const; 160 161 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 162 void print(raw_ostream& OS) const override; 163 #endif 164 }; 165 166 class SDWADstOperand : public SDWAOperand { 167 private: 168 SdwaSel DstSel; 169 DstUnused DstUn; 170 171 public: 172 173 SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 174 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) 175 : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} 176 177 MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 178 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 179 180 SdwaSel getDstSel() const { return DstSel; } 181 DstUnused getDstUnused() const { return DstUn; } 182 183 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 184 void print(raw_ostream& OS) const override; 185 #endif 186 }; 187 188 class SDWADstPreserveOperand : public SDWADstOperand { 189 private: 190 MachineOperand *Preserve; 191 192 public: 193 SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 194 MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) 195 : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), 196 Preserve(PreserveOp) {} 197 198 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 199 200 MachineOperand *getPreservedOperand() const { return Preserve; } 201 202 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 203 void print(raw_ostream& OS) const override; 204 #endif 205 }; 206 207 } // end anonymous namespace 208 209 INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false) 210 211 char SIPeepholeSDWA::ID = 0; 212 213 char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; 214 215 FunctionPass *llvm::createSIPeepholeSDWAPass() { 216 return new SIPeepholeSDWA(); 217 } 218 219 220 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 221 static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { 222 switch(Sel) { 223 case BYTE_0: OS << "BYTE_0"; break; 224 case BYTE_1: OS << "BYTE_1"; break; 225 case BYTE_2: OS << "BYTE_2"; break; 226 case BYTE_3: OS << "BYTE_3"; break; 227 case WORD_0: OS << "WORD_0"; break; 228 case WORD_1: OS << "WORD_1"; break; 229 case DWORD: OS << "DWORD"; break; 230 } 231 return OS; 232 } 233 234 static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { 235 switch(Un) { 236 case UNUSED_PAD: OS << "UNUSED_PAD"; break; 237 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; 238 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; 239 } 240 return OS; 241 } 242 243 static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { 244 Operand.print(OS); 245 return OS; 246 } 247 248 LLVM_DUMP_METHOD 249 void SDWASrcOperand::print(raw_ostream& OS) const { 250 OS << "SDWA src: " << *getTargetOperand() 251 << " src_sel:" << getSrcSel() 252 << " abs:" << getAbs() << " neg:" << getNeg() 253 << " sext:" << getSext() << '\n'; 254 } 255 256 LLVM_DUMP_METHOD 257 void SDWADstOperand::print(raw_ostream& OS) const { 258 OS << "SDWA dst: " << *getTargetOperand() 259 << " dst_sel:" << getDstSel() 260 << " dst_unused:" << getDstUnused() << '\n'; 261 } 262 263 LLVM_DUMP_METHOD 264 void SDWADstPreserveOperand::print(raw_ostream& OS) const { 265 OS << "SDWA preserve dst: " << *getTargetOperand() 266 << " dst_sel:" << getDstSel() 267 << " preserve:" << *getPreservedOperand() << '\n'; 268 } 269 270 #endif 271 272 static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { 273 assert(To.isReg() && From.isReg()); 274 To.setReg(From.getReg()); 275 To.setSubReg(From.getSubReg()); 276 To.setIsUndef(From.isUndef()); 277 if (To.isUse()) { 278 To.setIsKill(From.isKill()); 279 } else { 280 To.setIsDead(From.isDead()); 281 } 282 } 283 284 static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { 285 return LHS.isReg() && 286 RHS.isReg() && 287 LHS.getReg() == RHS.getReg() && 288 LHS.getSubReg() == RHS.getSubReg(); 289 } 290 291 static MachineOperand *findSingleRegUse(const MachineOperand *Reg, 292 const MachineRegisterInfo *MRI) { 293 if (!Reg->isReg() || !Reg->isDef()) 294 return nullptr; 295 296 MachineOperand *ResMO = nullptr; 297 for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) { 298 // If there exist use of subreg of Reg then return nullptr 299 if (!isSameReg(UseMO, *Reg)) 300 return nullptr; 301 302 // Check that there is only one instruction that uses Reg 303 if (!ResMO) { 304 ResMO = &UseMO; 305 } else if (ResMO->getParent() != UseMO.getParent()) { 306 return nullptr; 307 } 308 } 309 310 return ResMO; 311 } 312 313 static MachineOperand *findSingleRegDef(const MachineOperand *Reg, 314 const MachineRegisterInfo *MRI) { 315 if (!Reg->isReg()) 316 return nullptr; 317 318 MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg()); 319 if (!DefInstr) 320 return nullptr; 321 322 for (auto &DefMO : DefInstr->defs()) { 323 if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) 324 return &DefMO; 325 } 326 327 // Ignore implicit defs. 328 return nullptr; 329 } 330 331 uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, 332 const MachineOperand *SrcOp) const { 333 uint64_t Mods = 0; 334 const auto *MI = SrcOp->getParent(); 335 if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) { 336 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { 337 Mods = Mod->getImm(); 338 } 339 } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) { 340 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) { 341 Mods = Mod->getImm(); 342 } 343 } 344 if (Abs || Neg) { 345 assert(!Sext && 346 "Float and integer src modifiers can't be set simulteniously"); 347 Mods |= Abs ? SISrcMods::ABS : 0; 348 Mods ^= Neg ? SISrcMods::NEG : 0; 349 } else if (Sext) { 350 Mods |= SISrcMods::SEXT; 351 } 352 353 return Mods; 354 } 355 356 MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { 357 // For SDWA src operand potential instruction is one that use register 358 // defined by parent instruction 359 MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); 360 if (!PotentialMO) 361 return nullptr; 362 363 return PotentialMO->getParent(); 364 } 365 366 bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 367 // Find operand in instruction that matches source operand and replace it with 368 // target operand. Set corresponding src_sel 369 bool IsPreserveSrc = false; 370 MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 371 MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 372 MachineOperand *SrcMods = 373 TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 374 assert(Src && (Src->isReg() || Src->isImm())); 375 if (!isSameReg(*Src, *getReplacedOperand())) { 376 // If this is not src0 then it could be src1 377 Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 378 SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 379 SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 380 381 if (!Src || 382 !isSameReg(*Src, *getReplacedOperand())) { 383 // It's possible this Src is a tied operand for 384 // UNUSED_PRESERVE, in which case we can either 385 // abandon the peephole attempt, or if legal we can 386 // copy the target operand into the tied slot 387 // if the preserve operation will effectively cause the same 388 // result by overwriting the rest of the dst. 389 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 390 MachineOperand *DstUnused = 391 TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 392 393 if (Dst && 394 DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { 395 // This will work if the tied src is acessing WORD_0, and the dst is 396 // writing WORD_1. Modifiers don't matter because all the bits that 397 // would be impacted are being overwritten by the dst. 398 // Any other case will not work. 399 SdwaSel DstSel = static_cast<SdwaSel>( 400 TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); 401 if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && 402 getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { 403 IsPreserveSrc = true; 404 auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 405 AMDGPU::OpName::vdst); 406 auto TiedIdx = MI.findTiedOperandIdx(DstIdx); 407 Src = &MI.getOperand(TiedIdx); 408 SrcSel = nullptr; 409 SrcMods = nullptr; 410 } else { 411 // Not legal to convert this src 412 return false; 413 } 414 } 415 } 416 assert(Src && Src->isReg()); 417 418 if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 419 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 420 !isSameReg(*Src, *getReplacedOperand())) { 421 // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to 422 // src2. This is not allowed. 423 return false; 424 } 425 426 assert(isSameReg(*Src, *getReplacedOperand()) && 427 (IsPreserveSrc || (SrcSel && SrcMods))); 428 } 429 copyRegOperand(*Src, *getTargetOperand()); 430 if (!IsPreserveSrc) { 431 SrcSel->setImm(getSrcSel()); 432 SrcMods->setImm(getSrcMods(TII, Src)); 433 } 434 getTargetOperand()->setIsKill(false); 435 return true; 436 } 437 438 MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { 439 // For SDWA dst operand potential instruction is one that defines register 440 // that this operand uses 441 MachineRegisterInfo *MRI = getMRI(); 442 MachineInstr *ParentMI = getParentInst(); 443 444 MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI); 445 if (!PotentialMO) 446 return nullptr; 447 448 // Check that ParentMI is the only instruction that uses replaced register 449 for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) { 450 if (&UseInst != ParentMI) 451 return nullptr; 452 } 453 454 return PotentialMO->getParent(); 455 } 456 457 bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 458 // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused 459 460 if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 461 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 462 getDstSel() != AMDGPU::SDWA::DWORD) { 463 // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD 464 return false; 465 } 466 467 MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 468 assert(Operand && 469 Operand->isReg() && 470 isSameReg(*Operand, *getReplacedOperand())); 471 copyRegOperand(*Operand, *getTargetOperand()); 472 MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 473 assert(DstSel); 474 DstSel->setImm(getDstSel()); 475 MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 476 assert(DstUnused); 477 DstUnused->setImm(getDstUnused()); 478 479 // Remove original instruction because it would conflict with our new 480 // instruction by register definition 481 getParentInst()->eraseFromParent(); 482 return true; 483 } 484 485 bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, 486 const SIInstrInfo *TII) { 487 // MI should be moved right before v_or_b32. 488 // For this we should clear all kill flags on uses of MI src-operands or else 489 // we can encounter problem with use of killed operand. 490 for (MachineOperand &MO : MI.uses()) { 491 if (!MO.isReg()) 492 continue; 493 getMRI()->clearKillFlags(MO.getReg()); 494 } 495 496 // Move MI before v_or_b32 497 auto MBB = MI.getParent(); 498 MBB->remove(&MI); 499 MBB->insert(getParentInst(), &MI); 500 501 // Add Implicit use of preserved register 502 MachineInstrBuilder MIB(*MBB->getParent(), MI); 503 MIB.addReg(getPreservedOperand()->getReg(), 504 RegState::ImplicitKill, 505 getPreservedOperand()->getSubReg()); 506 507 // Tie dst to implicit use 508 MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst), 509 MI.getNumOperands() - 1); 510 511 // Convert MI as any other SDWADstOperand and remove v_or_b32 512 return SDWADstOperand::convertToSDWA(MI, TII); 513 } 514 515 Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { 516 if (Op.isImm()) { 517 return Op.getImm(); 518 } 519 520 // If this is not immediate then it can be copy of immediate value, e.g.: 521 // %1 = S_MOV_B32 255; 522 if (Op.isReg()) { 523 for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { 524 if (!isSameReg(Op, Def)) 525 continue; 526 527 const MachineInstr *DefInst = Def.getParent(); 528 if (!TII->isFoldableCopy(*DefInst)) 529 return None; 530 531 const MachineOperand &Copied = DefInst->getOperand(1); 532 if (!Copied.isImm()) 533 return None; 534 535 return Copied.getImm(); 536 } 537 } 538 539 return None; 540 } 541 542 std::unique_ptr<SDWAOperand> 543 SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { 544 unsigned Opcode = MI.getOpcode(); 545 switch (Opcode) { 546 case AMDGPU::V_LSHRREV_B32_e32: 547 case AMDGPU::V_ASHRREV_I32_e32: 548 case AMDGPU::V_LSHLREV_B32_e32: 549 case AMDGPU::V_LSHRREV_B32_e64: 550 case AMDGPU::V_ASHRREV_I32_e64: 551 case AMDGPU::V_LSHLREV_B32_e64: { 552 // from: v_lshrrev_b32_e32 v1, 16/24, v0 553 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 554 555 // from: v_ashrrev_i32_e32 v1, 16/24, v0 556 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 557 558 // from: v_lshlrev_b32_e32 v1, 16/24, v0 559 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD 560 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 561 auto Imm = foldToImm(*Src0); 562 if (!Imm) 563 break; 564 565 if (*Imm != 16 && *Imm != 24) 566 break; 567 568 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 569 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 570 if (TRI->isPhysicalRegister(Src1->getReg()) || 571 TRI->isPhysicalRegister(Dst->getReg())) 572 break; 573 574 if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || 575 Opcode == AMDGPU::V_LSHLREV_B32_e64) { 576 return make_unique<SDWADstOperand>( 577 Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); 578 } else { 579 return make_unique<SDWASrcOperand>( 580 Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, 581 Opcode != AMDGPU::V_LSHRREV_B32_e32 && 582 Opcode != AMDGPU::V_LSHRREV_B32_e64); 583 } 584 break; 585 } 586 587 case AMDGPU::V_LSHRREV_B16_e32: 588 case AMDGPU::V_ASHRREV_I16_e32: 589 case AMDGPU::V_LSHLREV_B16_e32: 590 case AMDGPU::V_LSHRREV_B16_e64: 591 case AMDGPU::V_ASHRREV_I16_e64: 592 case AMDGPU::V_LSHLREV_B16_e64: { 593 // from: v_lshrrev_b16_e32 v1, 8, v0 594 // to SDWA src:v0 src_sel:BYTE_1 595 596 // from: v_ashrrev_i16_e32 v1, 8, v0 597 // to SDWA src:v0 src_sel:BYTE_1 sext:1 598 599 // from: v_lshlrev_b16_e32 v1, 8, v0 600 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 601 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 602 auto Imm = foldToImm(*Src0); 603 if (!Imm || *Imm != 8) 604 break; 605 606 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 607 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 608 609 if (TRI->isPhysicalRegister(Src1->getReg()) || 610 TRI->isPhysicalRegister(Dst->getReg())) 611 break; 612 613 if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || 614 Opcode == AMDGPU::V_LSHLREV_B16_e64) { 615 return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); 616 } else { 617 return make_unique<SDWASrcOperand>( 618 Src1, Dst, BYTE_1, false, false, 619 Opcode != AMDGPU::V_LSHRREV_B16_e32 && 620 Opcode != AMDGPU::V_LSHRREV_B16_e64); 621 } 622 break; 623 } 624 625 case AMDGPU::V_BFE_I32: 626 case AMDGPU::V_BFE_U32: { 627 // e.g.: 628 // from: v_bfe_u32 v1, v0, 8, 8 629 // to SDWA src:v0 src_sel:BYTE_1 630 631 // offset | width | src_sel 632 // ------------------------ 633 // 0 | 8 | BYTE_0 634 // 0 | 16 | WORD_0 635 // 0 | 32 | DWORD ? 636 // 8 | 8 | BYTE_1 637 // 16 | 8 | BYTE_2 638 // 16 | 16 | WORD_1 639 // 24 | 8 | BYTE_3 640 641 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 642 auto Offset = foldToImm(*Src1); 643 if (!Offset) 644 break; 645 646 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 647 auto Width = foldToImm(*Src2); 648 if (!Width) 649 break; 650 651 SdwaSel SrcSel = DWORD; 652 653 if (*Offset == 0 && *Width == 8) 654 SrcSel = BYTE_0; 655 else if (*Offset == 0 && *Width == 16) 656 SrcSel = WORD_0; 657 else if (*Offset == 0 && *Width == 32) 658 SrcSel = DWORD; 659 else if (*Offset == 8 && *Width == 8) 660 SrcSel = BYTE_1; 661 else if (*Offset == 16 && *Width == 8) 662 SrcSel = BYTE_2; 663 else if (*Offset == 16 && *Width == 16) 664 SrcSel = WORD_1; 665 else if (*Offset == 24 && *Width == 8) 666 SrcSel = BYTE_3; 667 else 668 break; 669 670 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 671 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 672 673 if (TRI->isPhysicalRegister(Src0->getReg()) || 674 TRI->isPhysicalRegister(Dst->getReg())) 675 break; 676 677 return make_unique<SDWASrcOperand>( 678 Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32); 679 } 680 681 case AMDGPU::V_AND_B32_e32: 682 case AMDGPU::V_AND_B32_e64: { 683 // e.g.: 684 // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 685 // to SDWA src:v0 src_sel:WORD_0/BYTE_0 686 687 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 688 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 689 auto ValSrc = Src1; 690 auto Imm = foldToImm(*Src0); 691 692 if (!Imm) { 693 Imm = foldToImm(*Src1); 694 ValSrc = Src0; 695 } 696 697 if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) 698 break; 699 700 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 701 702 if (TRI->isPhysicalRegister(Src1->getReg()) || 703 TRI->isPhysicalRegister(Dst->getReg())) 704 break; 705 706 return make_unique<SDWASrcOperand>( 707 ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); 708 } 709 710 case AMDGPU::V_OR_B32_e32: 711 case AMDGPU::V_OR_B32_e64: { 712 // Patterns for dst_unused:UNUSED_PRESERVE. 713 // e.g., from: 714 // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 715 // src1_sel:WORD_1 src2_sel:WORD1 716 // v_add_f16_e32 v3, v1, v2 717 // v_or_b32_e32 v4, v0, v3 718 // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 719 720 // Check if one of operands of v_or_b32 is SDWA instruction 721 using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>; 722 auto CheckOROperandsForSDWA = 723 [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { 724 if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) 725 return CheckRetType(None); 726 727 MachineOperand *Op1Def = findSingleRegDef(Op1, MRI); 728 if (!Op1Def) 729 return CheckRetType(None); 730 731 MachineInstr *Op1Inst = Op1Def->getParent(); 732 if (!TII->isSDWA(*Op1Inst)) 733 return CheckRetType(None); 734 735 MachineOperand *Op2Def = findSingleRegDef(Op2, MRI); 736 if (!Op2Def) 737 return CheckRetType(None); 738 739 return CheckRetType(std::make_pair(Op1Def, Op2Def)); 740 }; 741 742 MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 743 MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 744 assert(OrSDWA && OrOther); 745 auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); 746 if (!Res) { 747 OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 748 OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 749 assert(OrSDWA && OrOther); 750 Res = CheckOROperandsForSDWA(OrSDWA, OrOther); 751 if (!Res) 752 break; 753 } 754 755 MachineOperand *OrSDWADef = Res->first; 756 MachineOperand *OrOtherDef = Res->second; 757 assert(OrSDWADef && OrOtherDef); 758 759 MachineInstr *SDWAInst = OrSDWADef->getParent(); 760 MachineInstr *OtherInst = OrOtherDef->getParent(); 761 762 // Check that OtherInstr is actually bitwise compatible with SDWAInst = their 763 // destination patterns don't overlap. Compatible instruction can be either 764 // regular instruction with compatible bitness or SDWA instruction with 765 // correct dst_sel 766 // SDWAInst | OtherInst bitness / OtherInst dst_sel 767 // ----------------------------------------------------- 768 // DWORD | no / no 769 // WORD_0 | no / BYTE_2/3, WORD_1 770 // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0 771 // BYTE_0 | no / BYTE_1/2/3, WORD_1 772 // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1 773 // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0 774 // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0 775 // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK 776 // but v_add_f32 is not. 777 778 // TODO: add support for non-SDWA instructions as OtherInst. 779 // For now this only works with SDWA instructions. For regular instructions 780 // there is no way to determine if the instruction writes only 8/16/24-bit 781 // out of full register size and all registers are at min 32-bit wide. 782 if (!TII->isSDWA(*OtherInst)) 783 break; 784 785 SdwaSel DstSel = static_cast<SdwaSel>( 786 TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));; 787 SdwaSel OtherDstSel = static_cast<SdwaSel>( 788 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); 789 790 bool DstSelAgree = false; 791 switch (DstSel) { 792 case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || 793 (OtherDstSel == BYTE_3) || 794 (OtherDstSel == WORD_1)); 795 break; 796 case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 797 (OtherDstSel == BYTE_1) || 798 (OtherDstSel == WORD_0)); 799 break; 800 case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || 801 (OtherDstSel == BYTE_2) || 802 (OtherDstSel == BYTE_3) || 803 (OtherDstSel == WORD_1)); 804 break; 805 case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 806 (OtherDstSel == BYTE_2) || 807 (OtherDstSel == BYTE_3) || 808 (OtherDstSel == WORD_1)); 809 break; 810 case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || 811 (OtherDstSel == BYTE_1) || 812 (OtherDstSel == BYTE_3) || 813 (OtherDstSel == WORD_0)); 814 break; 815 case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || 816 (OtherDstSel == BYTE_1) || 817 (OtherDstSel == BYTE_2) || 818 (OtherDstSel == WORD_0)); 819 break; 820 default: DstSelAgree = false; 821 } 822 823 if (!DstSelAgree) 824 break; 825 826 // Also OtherInst dst_unused should be UNUSED_PAD 827 DstUnused OtherDstUnused = static_cast<DstUnused>( 828 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused)); 829 if (OtherDstUnused != DstUnused::UNUSED_PAD) 830 break; 831 832 // Create DstPreserveOperand 833 MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 834 assert(OrDst && OrDst->isReg()); 835 836 return make_unique<SDWADstPreserveOperand>( 837 OrDst, OrSDWADef, OrOtherDef, DstSel); 838 839 } 840 } 841 842 return std::unique_ptr<SDWAOperand>(nullptr); 843 } 844 845 void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { 846 for (MachineInstr &MI : MBB) { 847 if (auto Operand = matchSDWAOperand(MI)) { 848 DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); 849 SDWAOperands[&MI] = std::move(Operand); 850 ++NumSDWAPatternsFound; 851 } 852 } 853 } 854 855 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, 856 const SISubtarget &ST) const { 857 // Check if this is already an SDWA instruction 858 unsigned Opc = MI.getOpcode(); 859 if (TII->isSDWA(Opc)) 860 return true; 861 862 // Check if this instruction has opcode that supports SDWA 863 if (AMDGPU::getSDWAOp(Opc) == -1) 864 Opc = AMDGPU::getVOPe32(Opc); 865 866 if (AMDGPU::getSDWAOp(Opc) == -1) 867 return false; 868 869 if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) 870 return false; 871 872 if (TII->isVOPC(Opc)) { 873 if (!ST.hasSDWASdst()) { 874 const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); 875 if (SDst && SDst->getReg() != AMDGPU::VCC) 876 return false; 877 } 878 879 if (!ST.hasSDWAOutModsVOPC() && 880 (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || 881 TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) 882 return false; 883 884 } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || 885 !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 886 return false; 887 } 888 889 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 || 890 Opc == AMDGPU::V_MAC_F32_e32)) 891 return false; 892 893 // FIXME: has SDWA but require handling of implicit VCC use 894 if (Opc == AMDGPU::V_CNDMASK_B32_e32) 895 return false; 896 897 return true; 898 } 899 900 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, 901 const SDWAOperandsVector &SDWAOperands) { 902 903 DEBUG(dbgs() << "Convert instruction:" << MI); 904 905 // Convert to sdwa 906 int SDWAOpcode; 907 unsigned Opcode = MI.getOpcode(); 908 if (TII->isSDWA(Opcode)) { 909 SDWAOpcode = Opcode; 910 } else { 911 SDWAOpcode = AMDGPU::getSDWAOp(Opcode); 912 if (SDWAOpcode == -1) 913 SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode)); 914 } 915 assert(SDWAOpcode != -1); 916 917 const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); 918 919 // Create SDWA version of instruction MI and initialize its operands 920 MachineInstrBuilder SDWAInst = 921 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); 922 923 // Copy dst, if it is present in original then should also be present in SDWA 924 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 925 if (Dst) { 926 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); 927 SDWAInst.add(*Dst); 928 } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) { 929 assert(Dst && 930 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); 931 SDWAInst.add(*Dst); 932 } else { 933 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); 934 SDWAInst.addReg(AMDGPU::VCC, RegState::Define); 935 } 936 937 // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and 938 // src0_modifiers (except for v_nop_sdwa, but it can't get here) 939 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 940 assert( 941 Src0 && 942 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 && 943 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1); 944 if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) 945 SDWAInst.addImm(Mod->getImm()); 946 else 947 SDWAInst.addImm(0); 948 SDWAInst.add(*Src0); 949 950 // Copy src1 if present, initialize src1_modifiers. 951 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 952 if (Src1) { 953 assert( 954 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 && 955 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1); 956 if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)) 957 SDWAInst.addImm(Mod->getImm()); 958 else 959 SDWAInst.addImm(0); 960 SDWAInst.add(*Src1); 961 } 962 963 if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || 964 SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { 965 // v_mac_f16/32 has additional src2 operand tied to vdst 966 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 967 assert(Src2); 968 SDWAInst.add(*Src2); 969 } 970 971 // Copy clamp if present, initialize otherwise 972 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); 973 MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); 974 if (Clamp) { 975 SDWAInst.add(*Clamp); 976 } else { 977 SDWAInst.addImm(0); 978 } 979 980 // Copy omod if present, initialize otherwise if needed 981 if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { 982 MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); 983 if (OMod) { 984 SDWAInst.add(*OMod); 985 } else { 986 SDWAInst.addImm(0); 987 } 988 } 989 990 // Copy dst_sel if present, initialize otherwise if needed 991 if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) { 992 MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 993 if (DstSel) { 994 SDWAInst.add(*DstSel); 995 } else { 996 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 997 } 998 } 999 1000 // Copy dst_unused if present, initialize otherwise if needed 1001 if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) { 1002 MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 1003 if (DstUnused) { 1004 SDWAInst.add(*DstUnused); 1005 } else { 1006 SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); 1007 } 1008 } 1009 1010 // Copy src0_sel if present, initialize otherwise 1011 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1); 1012 MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 1013 if (Src0Sel) { 1014 SDWAInst.add(*Src0Sel); 1015 } else { 1016 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1017 } 1018 1019 // Copy src1_sel if present, initialize otherwise if needed 1020 if (Src1) { 1021 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1); 1022 MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 1023 if (Src1Sel) { 1024 SDWAInst.add(*Src1Sel); 1025 } else { 1026 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1027 } 1028 } 1029 1030 // Check for a preserved register that needs to be copied. 1031 auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 1032 if (DstUnused && 1033 DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { 1034 // We expect, if we are here, that the instruction was already in it's SDWA form, 1035 // with a tied operand. 1036 assert(Dst && Dst->isTied()); 1037 assert(Opcode == static_cast<unsigned int>(SDWAOpcode)); 1038 // We also expect a vdst, since sdst can't preserve. 1039 auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst); 1040 assert(PreserveDstIdx != -1); 1041 1042 auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx); 1043 auto Tied = MI.getOperand(TiedIdx); 1044 1045 SDWAInst.add(Tied); 1046 SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1); 1047 } 1048 1049 // Apply all sdwa operand patterns. 1050 bool Converted = false; 1051 for (auto &Operand : SDWAOperands) { 1052 DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); 1053 // There should be no intesection between SDWA operands and potential MIs 1054 // e.g.: 1055 // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 1056 // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 1057 // v_add_u32 v3, v4, v2 1058 // 1059 // In that example it is possible that we would fold 2nd instruction into 3rd 1060 // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was 1061 // already destroyed). So if SDWAOperand is also a potential MI then do not 1062 // apply it. 1063 if (PotentialMatches.count(Operand->getParentInst()) == 0) 1064 Converted |= Operand->convertToSDWA(*SDWAInst, TII); 1065 } 1066 if (Converted) { 1067 ConvertedInstructions.push_back(SDWAInst); 1068 } else { 1069 SDWAInst->eraseFromParent(); 1070 return false; 1071 } 1072 1073 DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); 1074 ++NumSDWAInstructionsPeepholed; 1075 1076 MI.eraseFromParent(); 1077 return true; 1078 } 1079 1080 // If an instruction was converted to SDWA it should not have immediates or SGPR 1081 // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. 1082 void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, 1083 const SISubtarget &ST) const { 1084 const MCInstrDesc &Desc = TII->get(MI.getOpcode()); 1085 unsigned ConstantBusCount = 0; 1086 for (MachineOperand &Op : MI.explicit_uses()) { 1087 if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) 1088 continue; 1089 1090 unsigned I = MI.getOperandNo(&Op); 1091 if (Desc.OpInfo[I].RegClass == -1 || 1092 !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) 1093 continue; 1094 1095 if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && 1096 TRI->isSGPRReg(*MRI, Op.getReg())) { 1097 ++ConstantBusCount; 1098 continue; 1099 } 1100 1101 unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1102 auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1103 TII->get(AMDGPU::V_MOV_B32_e32), VGPR); 1104 if (Op.isImm()) 1105 Copy.addImm(Op.getImm()); 1106 else if (Op.isReg()) 1107 Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0, 1108 Op.getSubReg()); 1109 Op.ChangeToRegister(VGPR, false); 1110 } 1111 } 1112 1113 bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { 1114 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 1115 1116 if (!ST.hasSDWA() || skipFunction(MF.getFunction())) 1117 return false; 1118 1119 MRI = &MF.getRegInfo(); 1120 TRI = ST.getRegisterInfo(); 1121 TII = ST.getInstrInfo(); 1122 1123 // Find all SDWA operands in MF. 1124 bool Ret = false; 1125 for (MachineBasicBlock &MBB : MF) { 1126 bool Changed = false; 1127 do { 1128 matchSDWAOperands(MBB); 1129 1130 for (const auto &OperandPair : SDWAOperands) { 1131 const auto &Operand = OperandPair.second; 1132 MachineInstr *PotentialMI = Operand->potentialToConvert(TII); 1133 if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { 1134 PotentialMatches[PotentialMI].push_back(Operand.get()); 1135 } 1136 } 1137 1138 for (auto &PotentialPair : PotentialMatches) { 1139 MachineInstr &PotentialMI = *PotentialPair.first; 1140 convertToSDWA(PotentialMI, PotentialPair.second); 1141 } 1142 1143 PotentialMatches.clear(); 1144 SDWAOperands.clear(); 1145 1146 Changed = !ConvertedInstructions.empty(); 1147 1148 if (Changed) 1149 Ret = true; 1150 while (!ConvertedInstructions.empty()) 1151 legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); 1152 } while (Changed); 1153 } 1154 1155 return Ret; 1156 } 1157