//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target; // Operand that will be used in the converted instruction
  MachineOperand *Replaced; // Operand that will be replaced by Target
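  // E.g. for the shift pattern in the file header, the SDWASrcOperand built
  // from "V_LSHRREV_B32_e32 %0, 16, %1" has Target = %1 and Replaced = %0:
  // users of %0 can read %1 directly with src_sel:WORD_1 instead.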

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:

  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}


#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
    << " src_sel:" << getSrcSel()
    << " abs:" << getAbs() << " neg:" << getNeg()
    << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

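// Return the single use of the register defined by Reg, or nullptr if the
// register is used through a subregister or by more than one instruction.
// E.g. given
//   %0 = V_LSHRREV_B32_e32 16, %1
//   %2 = V_ADD_CO_U32_e32 %0, %3
// with no other users of %0, this returns the use of %0 in the V_ADD.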
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg, return nullptr.
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg.
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

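// Return the operand that uniquely and explicitly defines the register used
// by Reg, or nullptr if there is no such def.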
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
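    // Note: NEG is XOR'ed rather than OR'ed because the operand may already
    // carry a NEG modifier; folding in an additional negation has to toggle
    // it, not just set it.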
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand, the potential instruction is one that uses the
  // register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in the instruction that matches the replaced operand,
  // replace it with the target operand, and set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src.
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
         !isSameReg(*Src, *getReplacedOperand())) {
      // In the case of v_mac_f16/32_sdwa, this pass can try to apply the src
      // operand to src2, which is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand, the potential instruction is one that defines
  // the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand, and set dst_sel
  // and dst_unused.
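  // E.g. for the pattern in the file header, folding
  // "V_LSHLREV_B32_e32 %4, 16, %2" into the add that defines %2 rewrites the
  // add to write %4 with dst_sel:WORD_1 dst_unused:UNUSED_PAD and erases the
  // shift.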

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa only allow dst_sel equal to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands,
  // or else we can encounter a problem with a use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register.
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use.
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}

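// If Op is an immediate, or a register defined by a foldable copy of an
// immediate, return that immediate; otherwise return None.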
Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
            Src1, Dst, BYTE_1, false, false,
            Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
          Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction.
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(None);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(None);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(None);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(None);

        return CheckRetType(std::make_pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. that their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible
    // bitness or an SDWA instruction with the correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
      TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also, OtherInst's dst_unused should be UNUSED_PAD.
    DstUnused OtherDstUnused = static_cast<DstUnused>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create the DstPreserveOperand.
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
      OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert V_ADDC_U32_e64 into V_ADDC_U32_e32, and V_ADD_CO_U32_e64 into
// V_ADD_CO_U32_e32. This allows isConvertibleToSDWA to transform
// V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//  %48:vgpr_32 = V_ADDC_U32_e32
//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related V_ADDC/V_SUBB instruction, which consumes the carry
  // produced by MI.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
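  // (25 bounds how many neighboring instructions computeRegisterLiveness
  // examines before giving up with LQR_Unknown.)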
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check that VCC is not clobbered in the open range (MI, MISucc).
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
    .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Replace MISucc with V_{SUBB|ADDC}_U32_e32.
  BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
    .setMIFlags(MISucc.getFlags());

  MISucc.eraseFromParent();
}

bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                         const GCNSubtarget &ST) const {
  // Check if this is already an SDWA instruction.
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA.
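  // E.g. V_ADD_CO_U32_e64 has no SDWA form of its own, but its VOP2
  // counterpart V_ADD_CO_U32_e32 maps to V_ADD_CO_U32_sdwa.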
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
    .setMIFlags(MI.getFlags());
  // Copy dst; if it is present in the original, it should also be present in
  // the SDWA form.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs,
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is also
    // a potential MI, do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// An instruction converted to SDWA should not have immediate or SGPR operands
// (one SGPR operand is allowed on GFX9). Copy its scalar operands into VGPRs.
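// E.g. an SGPR or immediate operand is legalized by materializing it in a
// VGPR first:
//   %vgpr = V_MOV_B32_e32 %sgpr_or_imm
// and rewriting the SDWA instruction to read %vgpr instead.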
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
       !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
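      // Converting an instruction can expose new SDWA patterns, so repeat
      // until no further conversions are made.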
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
           (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
            PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}