//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_I32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_I32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
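    // This pass rewrites and erases instructions in place and never changes
    // the block structure, so the CFG is preserved.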
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:

  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace
INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}


#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in the instruction that matches the source operand,
  // replace it with the target operand and set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
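        // For illustration (schematic MIR, register names are hypothetical):
        // replacing the tied operand %tied of
        //   %vdst = V_ADD_F16_sdwa ..., %tied
        //       dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
        // with a target register read through src_sel:WORD_0 is safe, because
        // the preserve copies exactly the bits WORD_0 selects (the low word),
        // while the add itself overwrites WORD_1.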
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa only allows dst_sel equal to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because its register definition would
  // conflict with the new instruction.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI's src operands, or
  // else we can encounter a problem with a use of a killed operand.
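  // For illustration (schematic): MI is moved down to the position of the
  // v_or_b32. If an instruction in between carried a "killed %1" flag for a
  // register %1 that MI reads, MI would end up using %1 after its kill, so
  // all kill flags for those registers are dropped first.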
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (Register::isPhysicalRegister(Src1->getReg()) ||
        Register::isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Register::isPhysicalRegister(Src1->getReg()) ||
        Register::isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, BYTE_1, false, false,
          Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }

  case AMDGPU::V_BFE_I32:
  case AMDGPU::V_BFE_U32: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    //    0   |   8   | BYTE_0
    //    0   |  16   | WORD_0
    //    0   |  32   | DWORD ?
    //    8   |   8   | BYTE_1
    //   16   |   8   | BYTE_2
    //   16   |  16   | WORD_1
    //   24   |   8   | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Register::isPhysicalRegister(Src0->getReg()) ||
        Register::isPhysicalRegister(Dst->getReg()))
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Register::isPhysicalRegister(ValSrc->getReg()) ||
        Register::isPhysicalRegister(Dst->getReg()))
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(None);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(None);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(None);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(None);

        return CheckRetType(std::make_pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible
    // bitness or an SDWA instruction with the correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and the V_ADD_I32_e64 into
// V_ADD_I32_e32. This allows isConvertibleToSDWA to transform the
// V_ADD_I32_e32 into V_ADD_I32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_I32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32 = V_ADDC_U32_e32
//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the instruction that consumes MI's carry-out (the related
  // V_ADDC/V_SUBB).
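  // The carry-out is MI's sdst operand; findSingleRegUse only succeeds when a
  // single instruction consumes it, which for a lowered 64-bit pseudo is the
  // matching carry-consuming half of the pair.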
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is modified in the range (MI, MISucc).
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }
  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32
  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
  MI.eraseFromParent();
  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
  MISucc.eraseFromParent();
}

bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                         const GCNSubtarget &ST) const {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has an SDWA form but requires handling of the implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  return true;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA instruction
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except v_nop_sdwa, but that can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
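    // Copy the tied preserve source onto the new instruction and re-tie it to
    // the new vdst, mirroring the tie on the original MI.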
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In this example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is also
    // a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR is allowed on GFX9). Copy its scalar operands into
// VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}