//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_I32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_I32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
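// An illustrative sketch (mirroring runOnMachineFunction() below, not an
// additional API) of how the classes that follow cooperate:
// matchSDWAOperands() records one SDWAOperand per matched pattern,
// potentialToConvert() nominates the instruction that could absorb it, and
// convertToSDWA() rewrites that instruction in place:
//
//   matchSDWAOperands(MBB);                        // fills SDWAOperands
//   for (const auto &P : SDWAOperands)
//     if (MachineInstr *Use = P.second->potentialToConvert(TII))
//       PotentialMatches[Use].push_back(P.second.get());
//   for (auto &Match : PotentialMatches)
//     convertToSDWA(*Match.first, Match.second);   // emits the _sdwa form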
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
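// A minimal sketch of how a target pipeline would schedule this pass,
// assuming the usual legacy-PM TargetPassConfig flow (the exact hook in
// AMDGPUTargetMachine may differ):
//
//   void GCNPassConfig::addMachineSSAOptimization() {
//     TargetPassConfig::addMachineSSAOptimization();
//     addPass(&SIPeepholeSDWAID); // run while the function is still in SSA form
//   }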
FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there is a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}
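// Note on getSrcMods() below: the NEG bit is XORed in rather than ORed so
// that a neg modifier already present on the matched operand composes with
// the one this SDWAOperand carries (neg of neg cancels). A hypothetical
// example: if src1_modifiers of the parent instruction already holds
// SISrcMods::NEG and this operand also requests Neg, the two fold away and
// no negation is applied.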
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0;
    Mods ^= Neg ? SISrcMods::NEG : 0;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in the instruction that matches the source operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt, or, if legal,
      // copy the target operand into the tied slot when the preserve
      // operation would produce the same result anyway by overwriting the
      // rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
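        // A hypothetical illustration of the one legal case, in rough MIR:
        //
        //   %dst = V_ADD_F16_sdwa %a, %b, %tied
        //              dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE ...
        //
        // where this SDWASrcOperand selects WORD_0 of the value feeding
        // %tied: WORD_1 of %dst is produced by the add, WORD_0 flows through
        // the preserved (tied) operand, so redirecting the tied slot to our
        // target register reproduces the original result.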
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced register
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows dst_sel to be DWORD only
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can run into a use of a killed operand.
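  // Hypothetical illustration of the hazard: if some instruction between MI's
  // old position and v_or_b32 holds the last use of %x marked <kill>, moving
  // MI (which also reads %x) down past it would leave a use of %x after its
  // kill. clearKillFlags() drops every kill flag for each such register, which
  // keeps liveness conservatively correct; later passes can recompute them.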
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (TRI->isPhysicalRegister(Src1->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(Src1->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return make_unique<SDWASrcOperand>(
          Src1, Dst, BYTE_1, false, false,
          Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }

  case AMDGPU::V_BFE_I32:
  case AMDGPU::V_BFE_U32: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(Src0->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    return make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
  }
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    return make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(None);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(None);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(None);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(None);

        return CheckRetType(std::make_pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // that their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible bitness
    // or an SDWA instruction with a correct dst_sel:
    //     SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    //     DWORD    | no                     / no
    //     WORD_0   | no                     / BYTE_2/3, WORD_1
    //     WORD_1   | 8/16-bit instructions  / BYTE_0/1, WORD_0
    //     BYTE_0   | no                     / BYTE_1/2/3, WORD_1
    //     BYTE_1   | 8-bit                  / BYTE_0/2/3, WORD_1
    //     BYTE_2   | 8/16-bit               / BYTE_0/1/3, WORD_0
    //     BYTE_3   | 8/16/24-bit            / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK,
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
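    // A hypothetical concrete pair for the table above: two halves merged by
    // v_or_b32 (UNUSED_PAD zeroes the unwritten bits, so OR combines them):
    //
    //   v_add_f16_sdwa v0, ... dst_sel:WORD_1 dst_unused:UNUSED_PAD  ; bits 31:16
    //   v_mul_f16_sdwa v3, ... dst_sel:WORD_0 dst_unused:UNUSED_PAD  ; bits 15:0
    //   v_or_b32_e32   v4, v0, v3
    //
    // Here WORD_1 vs. WORD_0 agree per the table, so v_add_f16_sdwa can write
    // v4 directly with dst_unused:UNUSED_PRESERVE, keeping v3's half intact.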
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
                                         const SISubtarget &ST) const {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && SDst->getReg() != AMDGPU::VCC)
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // FIXME: has SDWA but requires handling of implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  return true;
}
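// For example (assuming the VI/GFX9 opcode tables), isConvertibleToSDWA()
// accepts V_ADD_I32_e64 even though only the e32 form has an SDWA sibling:
// getSDWAOp(V_ADD_I32_e64) is -1, so the query retries with
// getVOPe32(V_ADD_I32_e64) == V_ADD_I32_e32, whose SDWA form is
// V_ADD_I32_sdwa. convertToSDWA() below repeats the same two-step lookup.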
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA form
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);
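  // For orientation, the operand sequence assembled below for a typical
  // VOP2-derived SDWA opcode is (a sketch, not normative for every opcode):
  //   vdst, src0_modifiers, src0, src1_modifiers, src1, [src2,]
  //   clamp, [omod,] dst_sel, dst_unused, src0_sel, src1_sel
  // which is why each block that follows appends in exactly this order.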
  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
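    // Illustration with hypothetical operand indices: if vdst is operand #0
    // and its tied preserve input was operand #7 in MI, the code below re-adds
    // that operand at the end of SDWAInst and ties it to vdst again, since
    // BuildMI() does not carry tie constraints over from MI automatically.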
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs,
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is also
    // a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR is allowed on GFX9). Copy its scalar operands into
// VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const SISubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
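  // The per-block loop below iterates to a fixpoint: converting an
  // instruction can expose a new match (e.g. a shift now feeding a freshly
  // created _sdwa instruction), so matching reruns until an iteration
  // converts nothing.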
  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}