//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
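///       (src0_sel:WORD_1 reads the high half of %1, dst_sel:WORD_1 writes the
///        high half of %4, and dst_unused:UNUSED_PAD zero-fills the unused low
///        half)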
///
//===----------------------------------------------------------------------===//

#include "SIPeepholeSDWA.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  bool run(MachineFunction &MF);
};

class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)

char SIPeepholeSDWALegacy::ID = 0;

char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch (Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch (Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
    << " src_sel:" << getSrcSel()
    << " abs:" << getAbs() << " neg:" << getNeg()
    << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

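/// Return the operand of the single non-debug use of the register defined by
/// \p Reg, or nullptr if the register is used through a subregister or by
/// more than one instruction.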
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there is a use of a subregister of Reg, return nullptr.
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that only one instruction uses Reg.
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

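/// Return the explicit def operand of the unique instruction that defines the
/// register used by \p Reg, or nullptr if there is no unique definition or
/// the register is only defined implicitly.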
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
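    // NEG is applied with XOR rather than OR so that folding a negation into
    // an operand that already carries a NEG modifier cancels it instead of
    // setting it twice.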
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  }

  // Find the operand in the instruction that matches the replaced operand and
  // rewrite it to the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // This Src may be the tied operand of an UNUSED_PRESERVE instruction.
      // In that case we either abandon the peephole attempt or, if legal,
      // copy the target operand into the tied slot: the preserve operation
      // then produces the same result because it overwrites the rest of the
      // dst anyway.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
         !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused =
      TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because its register definition would
  // conflict with our new instruction.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we must clear all kill flags on uses of MI's src operands, or
  // else the moved instruction could use a register after its kill.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

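/// If \p Op is an immediate, or a register holding the result of a foldable
/// copy of an immediate, return that immediate value.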
std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate, it may be a copy of an immediate value, e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

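/// Try to match \p MI against one of the known SDWA-foldable patterns
/// (shifts by 8/16/24, BFE, AND with a byte/word mask, OR of SDWA results)
/// and return the corresponding SDWAOperand, or nullptr if nothing matched.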
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
          Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(std::nullopt);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(std::nullopt);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(std::nullopt);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(std::nullopt);

        return CheckRetType(std::pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // that their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible bitness
    // or an SDWA instruction with a correct dst_sel:
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // OtherInst's dst_unused must also be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create the SDWADstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
      OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

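/// Scan \p MBB and record an SDWAOperand for every instruction that matches
/// one of the peephole patterns recognized by matchSDWAOperand.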
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert a V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32 so that
// isConvertibleToSDWA can then transform the V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
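  // computeRegisterLiveness only scans a bounded neighborhood of instructions
  // (25 here) around MI; anything but LQR_Dead conservatively aborts the
  // rewrite.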
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range (MI, MISucc).
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{ADD|SUB}_CO_U32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
    .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.
  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

namespace {
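/// Check whether \p MI has an SDWA counterpart on subtarget \p ST and
/// satisfies all operand and modifier constraints for the conversion.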
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo* TII) {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc)) {
    // FIXME: Reenable after fixing selection handling.
    // Cf. llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
    return false;
  }

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of the implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace

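/// Rewrite \p MI into its SDWA form and try to apply all matched
/// \p SDWAOperands to it. Returns true and erases \p MI if at least one
/// operand pattern was applied; otherwise the new instruction is erased and
/// \p MI is kept.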
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
    .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it must also be present
  // in the SDWA form
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel)) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused)) {
    MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx =
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is
    // also a potential MI, do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
    for (MachineOperand &MO : SDWAInst->uses()) {
      if (!MO.isReg())
        continue;

      MRI->clearKillFlags(MO.getReg());
    }
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// An instruction converted to SDWA should not have immediate or SGPR operands
// (one SGPR operand is allowed on GFX9). Copy such scalar operands into VGPRs.
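// E.g. an offending immediate operand is first materialized into a VGPR:
//   %tmp:vgpr_32 = V_MOV_B32_e32 <imm>
// and the SDWA instruction's operand is then changed to use %tmp.
// (%tmp here is just an illustrative name for the new virtual register.)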
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIPeepholeSDWA().run(MF);
}

bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (PotentialMI &&
           (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
            PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate the potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}

PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}