//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
///
/// Replace:
///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

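  // Maps each matched instruction (shift/and/bfe) to the SDWA operand pattern
  // extracted from it by matchSDWAOperands().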
  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in the converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }
  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods() const;
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#ifndef NDEBUG

static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
  OS << "SDWA src: " << *Src.getTargetOperand()
     << " src_sel:" << Src.getSrcSel()
     << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
     << " sext:" << Src.getSext() << '\n';
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
  OS << "SDWA dst: " << *Dst.getTargetOperand()
     << " dst_sel:" << Dst.getDstSel()
     << " dst_unused:" << Dst.getDstUnused() << '\n';
  return OS;
}

#endif

static bool isSameBB(const MachineInstr *FirstMI, const MachineInstr *SecondMI) {
  assert(FirstMI && SecondMI);
  return FirstMI->getParent() == SecondMI->getParent();
}

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static bool isSubregOf(const MachineOperand &SubReg,
                       const MachineOperand &SuperReg,
                       const TargetRegisterInfo *TRI) {

  if (!SuperReg.isReg() || !SubReg.isReg())
    return false;

  if (isSameReg(SuperReg, SubReg))
    return true;

  if (SuperReg.getReg() != SubReg.getReg())
    return false;

  LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
  LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
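  // SubReg is covered by SuperReg iff every lane of SubMask is also set in
  // SuperMask, i.e. (SuperMask | ~SubMask) has all lanes set.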
  SuperMask |= ~SubMask;
  return SuperMask.all();
}

uint64_t SDWASrcOperand::getSrcMods() const {
  uint64_t Mods = 0;
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0;
    Mods |= Neg ? SISrcMods::NEG : 0;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineRegisterInfo *MRI = getMRI();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  MachineInstr *PotentialMI = nullptr;
  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
    // If this is a use of another subreg of the dst reg then do nothing.
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    // If there is a use of dst in another basic block, or a use of a superreg
    // of dst, then we should not combine this operand.
    if (!isSameBB(PotentialMO.getParent(), getParentInst()) ||
        !isSameReg(PotentialMO, *Replaced))
      return nullptr;

    // Check that PotentialMI is the only instruction that uses the dst reg.
    if (PotentialMI == nullptr) {
      PotentialMI = PotentialMO.getParent();
    } else if (PotentialMI != PotentialMO.getParent()) {
      return nullptr;
    }
  }

  return PotentialMI;
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in MI that matches the replaced operand and substitute
  // the target operand for it. Set the corresponding src_sel.

  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && Src->isReg());
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it should be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
  }
  copyRegOperand(*Src, *getTargetOperand());
  SrcSel->setImm(getSrcSel());
  SrcMods->setImm(getSrcMods());
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand, the potential instruction is the one that defines
  // the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    if (!isSameBB(getParentInst(), PotentialMO.getParent()) ||
        !isSameReg(*Replaced, PotentialMO))
      return nullptr;

    // Check that ParentMI is the only instruction that uses the replaced
    // register.
    for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
      if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
          UseMO.getParent() != ParentMI) {
        return nullptr;
      }
    }

    // Due to SSA this should be the only def of the replaced register, so
    // return it.
    return PotentialMO.getParent();
  }

  return nullptr;
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %vreg1<def> = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst) || !isSameBB(Op.getParent(), DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    unsigned Opcode = MI.getOpcode();
    switch (Opcode) {
    case AMDGPU::V_LSHRREV_B32_e32:
    case AMDGPU::V_ASHRREV_I32_e32:
    case AMDGPU::V_LSHLREV_B32_e32: {
      // from: v_lshrrev_b32_e32 v1, 16/24, v0
      // to SDWA src:v0 src_sel:WORD_1/BYTE_3

      // from: v_ashrrev_i32_e32 v1, 16/24, v0
      // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

      // from: v_lshlrev_b32_e32 v1, 16/24, v0
      // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      auto Imm = foldToImm(*Src0);
      if (!Imm)
        break;

      if (*Imm != 16 && *Imm != 24)
        break;

      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      if (TRI->isPhysicalRegister(Src1->getReg()) ||
          TRI->isPhysicalRegister(Dst->getReg()))
        break;

      if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
        auto SDWADst = make_unique<SDWADstOperand>(
            Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
        SDWAOperands[&MI] = std::move(SDWADst);
        ++NumSDWAPatternsFound;
      } else {
        auto SDWASrc = make_unique<SDWASrcOperand>(
            Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
            Opcode != AMDGPU::V_LSHRREV_B32_e32);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
        SDWAOperands[&MI] = std::move(SDWASrc);
        ++NumSDWAPatternsFound;
      }
      break;
    }

    case AMDGPU::V_LSHRREV_B16_e32:
    case AMDGPU::V_ASHRREV_I16_e32:
    case AMDGPU::V_LSHLREV_B16_e32: {
      // from: v_lshrrev_b16_e32 v1, 8, v0
      // to SDWA src:v0 src_sel:BYTE_1

      // from: v_ashrrev_i16_e32 v1, 8, v0
      // to SDWA src:v0 src_sel:BYTE_1 sext:1

      // from: v_lshlrev_b16_e32 v1, 8, v0
      // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      auto Imm = foldToImm(*Src0);
      if (!Imm || *Imm != 8)
        break;

      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

      if (TRI->isPhysicalRegister(Src1->getReg()) ||
          TRI->isPhysicalRegister(Dst->getReg()))
        break;

      if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
        auto SDWADst =
            make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
        SDWAOperands[&MI] = std::move(SDWADst);
        ++NumSDWAPatternsFound;
      } else {
        auto SDWASrc = make_unique<SDWASrcOperand>(
            Src1, Dst, BYTE_1, false, false,
            Opcode != AMDGPU::V_LSHRREV_B16_e32);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
        SDWAOperands[&MI] = std::move(SDWASrc);
        ++NumSDWAPatternsFound;
      }
      break;
    }

    case AMDGPU::V_BFE_I32:
    case AMDGPU::V_BFE_U32: {
      // e.g.:
      // from: v_bfe_u32 v1, v0, 8, 8
      // to SDWA src:v0 src_sel:BYTE_1

      // offset | width | src_sel
      // ------------------------
      // 0      | 8     | BYTE_0
      // 0      | 16    | WORD_0
      // 0      | 32    | DWORD ?
      // 8      | 8     | BYTE_1
      // 16     | 8     | BYTE_2
      // 16     | 16    | WORD_1
      // 24     | 8     | BYTE_3

      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      auto Offset = foldToImm(*Src1);
      if (!Offset)
        break;

      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      auto Width = foldToImm(*Src2);
      if (!Width)
        break;

      SdwaSel SrcSel = DWORD;

      if (*Offset == 0 && *Width == 8)
        SrcSel = BYTE_0;
      else if (*Offset == 0 && *Width == 16)
        SrcSel = WORD_0;
      else if (*Offset == 0 && *Width == 32)
        SrcSel = DWORD;
      else if (*Offset == 8 && *Width == 8)
        SrcSel = BYTE_1;
      else if (*Offset == 16 && *Width == 8)
        SrcSel = BYTE_2;
      else if (*Offset == 16 && *Width == 16)
        SrcSel = WORD_1;
      else if (*Offset == 24 && *Width == 8)
        SrcSel = BYTE_3;
      else
        break;

      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

      if (TRI->isPhysicalRegister(Src0->getReg()) ||
          TRI->isPhysicalRegister(Dst->getReg()))
        break;

      auto SDWASrc = make_unique<SDWASrcOperand>(
          Src0, Dst, SrcSel, false, false,
          Opcode != AMDGPU::V_BFE_U32);
      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
      SDWAOperands[&MI] = std::move(SDWASrc);
      ++NumSDWAPatternsFound;
      break;
    }
    case AMDGPU::V_AND_B32_e32: {
      // e.g.:
      // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
      // to SDWA src:v0 src_sel:WORD_0/BYTE_0

      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      auto Imm = foldToImm(*Src0);
      if (!Imm)
        break;

      if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
        break;

      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

      if (TRI->isPhysicalRegister(Src1->getReg()) ||
          TRI->isPhysicalRegister(Dst->getReg()))
        break;

      auto SDWASrc = make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
      SDWAOperands[&MI] = std::move(SDWASrc);
      ++NumSDWAPatternsFound;
      break;
    }
    }
  }
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  // Check if this instruction can be converted to SDWA:
  // 1. Does this opcode support SDWA?
  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
    return false;

  // 2. Are all operands VGPRs?
  for (const MachineOperand &Operand : MI.explicit_operands()) {
    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
      return false;
  }

  // Convert to SDWA.
  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

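  // Note: operands are appended below in the order the SDWA variants expect
  // them: [vdst,] src0_modifiers, src0, [src1_modifiers, src1,] [src2,]
  // clamp, [dst_sel, dst_unused,] src0_sel[, src1_sel].
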
  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA instruction.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(TII->isVOPC(MI));
  }

  // Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  } else {
    assert(TII->isVOP1(MI));
  }

  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst.
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Initialize clamp.
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  SDWAInst.addImm(0);

  // Initialize dst_sel and dst_unused if present.
  if (Dst) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
  }

  // Initialize src0_sel.
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  // Initialize src1_sel if present.
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  DEBUG(dbgs() << "Convert instruction:" << MI
               << "Into:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  if (!ST.hasSDWA() ||
      !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
    return false;
  }

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

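  // Maps each SDWA-convertible instruction to all of the operand patterns
  // (collected by matchSDWAOperands) that target it.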
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;

  // FIXME: For now we only combine instructions in one basic block.
  for (MachineBasicBlock &MBB : MF) {
    SDWAOperands.clear();
    matchSDWAOperands(MBB);

    PotentialMatches.clear();
    for (auto &OperandPair : SDWAOperands) {
      auto &Operand = OperandPair.second;
      MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
      if (PotentialMI) {
        PotentialMatches[PotentialMI].push_back(std::move(Operand));
      }
    }

    for (auto &PotentialPair : PotentialMatches) {
      MachineInstr &PotentialMI = *PotentialPair.first;
      convertToSDWA(PotentialMI, PotentialPair.second);
    }
  }
  return false;
}