//===- GCNDPPCombine.cpp - Optimization for DPP instructions -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This pass combines a V_MOV_B32_dpp instruction with its VALU uses as a DPP
// src0 operand. If any of the use instructions cannot be combined with the
// mov, the whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules:
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
//
// Otherwise cancel.
//
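// For example (illustrative, using the notation above):
//
//   $old = IMPLICIT_DEF
//   $dpp_value = V_MOV_B32_dpp $old, $src, dpp_controls...,
//                              row_mask:0xF, bank_mask:0xF, DPP_BOUND_ZERO
//   $res = V_ADD_U32_e32 $dpp_value, $src1
//
// matches the first rule (masks fully enabled, bound_ctrl zero) and becomes
//
//   $res = V_ADD_U32_dpp undef, $src, $src1, dpp_controls...,
//                        row_mask:0xF, bank_mask:0xF, DPP_BOUND_ZERO
//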
// The mov_dpp instruction must reside in the same BB as all of its uses.
//===----------------------------------------------------------------------===//

#include "GCNDPPCombine.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");

namespace {

class GCNDPPCombine {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const GCNSubtarget *ST;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd, bool CombBCZ,
                              bool IsShrinkable) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR, bool CombBCZ,
                              bool IsShrinkable) const;

  bool hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, int64_t Value,
                       int64_t Mask = -1) const;

  bool combineDPPMov(MachineInstr &MI) const;

  int getDPPOp(unsigned Op, bool IsShrinkable) const;
  bool isShrinkable(MachineInstr &MI) const;

public:
  bool run(MachineFunction &MF);
};

class GCNDPPCombineLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNDPPCombineLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};

} // end anonymous namespace

INITIALIZE_PASS(GCNDPPCombineLegacy, DEBUG_TYPE, "GCN DPP Combine", false,
                false)

char GCNDPPCombineLegacy::ID = 0;

char &llvm::GCNDPPCombineLegacyID = GCNDPPCombineLegacy::ID;

FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombineLegacy();
}

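// Returns true if the VOP3 instruction MI can be shrunk to a 32-bit (e32)
// encoding for the purpose of DPP combining: it must have an e32 equivalent,
// must not be a True16 instruction, its sdst (if any) must be unused, and it
// may carry no modifiers other than abs/neg.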
bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  if (!TII->isVOP3(Op)) {
    return false;
  }
  if (!TII->hasVALU32BitEncoding(Op)) {
    LLVM_DEBUG(dbgs() << "  Inst has no e32 equivalent\n");
    return false;
  }
  // Do not shrink True16 instructions pre-RA to avoid restricting the
  // register allocator to only 128 VGPRs.
  if (AMDGPU::isTrue16Inst(Op))
    return false;
  if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    // Give up if there are any uses of the sdst in carry-out or VOPC.
    // The shrunken form of the instruction would write it to vcc instead of to
    // a virtual register. If we rewrote the uses the shrinking would be
    // possible.
    if (!MRI->use_nodbg_empty(SDst->getReg()))
      return false;
  }
  // Check whether modifiers other than abs|neg are set (opsel, for example).
  const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
  if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
    LLVM_DEBUG(dbgs() << "  Inst has non-default modifiers\n");
    return false;
  }
  return true;
}

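// Returns the DPP opcode to use for Op, or -1 if there is none. Prefers the
// 32-bit DPP encoding, shrinking a VOP3 to its e32 form first when
// IsShrinkable, and falls back to the 64-bit (VOP3) DPP encoding on
// subtargets that support it.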
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
  int DPP32 = AMDGPU::getDPPOp32(Op);
  if (IsShrinkable) {
    assert(DPP32 == -1);
    int E32 = AMDGPU::getVOPe32(Op);
    DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
  }
  if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
    return DPP32;
  int DPP64 = -1;
  if (ST->hasVOP3DPP())
    DPP64 = AMDGPU::getDPPOp64(Op);
  if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
    return DPP64;
  return -1;
}

// Tracks the register operand's definition and returns:
//   1. the immediate operand used to initialize the register, if found;
//   2. nullptr if the register operand is undef;
//   3. the operand itself otherwise.
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch (Def->getOpcode()) {
  default: break;
  case AMDGPU::IMPLICIT_DEF:
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64: {
    auto &Op1 = Def->getOperand(1);
    if (Op1.isImm())
      return &Op1;
    break;
  }
  }
  return &OldOpnd;
}

[[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
                                                MachineRegisterInfo &MRI) {
  int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
  if (RegClass == -1)
    return 0;

  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
  return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
}

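// Builds the combined DPP instruction next to OrigMI: OrigMI's operands are
// copied over one by one, with src0 replaced by the mov's src0 and the old
// operand replaced by CombOldVGPR, and the mov's DPP controls (dpp_ctrl,
// row_mask, bank_mask, bound_ctrl) appended at the end. Returns nullptr and
// erases the partially built instruction if any operand is illegal for the
// DPP opcode.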
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ,
                                           bool IsShrinkable) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  bool HasVOP3DPP = ST->hasVOP3DPP();
  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }
  int OrigOpE32 = AMDGPU::getVOPe32(OrigOp);
  // Prior checks only incidentally cover the full-mask requirement for VOPC,
  // so assert it explicitly here.
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
  (void)MaskAllLanes;
  assert((MaskAllLanes ||
          !(TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                   TII->isVOPC(OrigOpE32)))) &&
         "VOPC cannot form DPP unless mask is full");

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp))
    .setMIFlags(OrigMI.getFlags());

  bool Fail = false;
  do {
    int NumOperands = 0;
    if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
      DPPInst.add(*Dst);
      ++NumOperands;
    }
    if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
      if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
        DPPInst.add(*SDst);
        ++NumOperands;
      }
      // If we shrank a 64-bit VOP3b to 32 bits, just ignore the sdst.
    }

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(
          CombOldVGPR,
          *MRI->getRegClass(
              TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
          *MRI));
      auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
      DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
                     CombOldVGPR.SubReg);
      ++NumOperands;
    } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                      TII->isVOPC(OrigOpE32))) {
      // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand
      // because they write to SGPRs, not VGPRs.
    } else {
      // TODO: this discards MAC/FMA instructions for now; add support later.
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
    if (Mod0) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src0_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src0_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    int Src0Idx = NumOperands;
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
    if (Mod1) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src1_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Src1) {
      int OpNum = NumOperands;
      // If the subtarget does not support SGPRs for the src1 operand, then the
      // requirements are the same as for src0. We check src0 instead because
      // pseudos are shared between subtargets and allow SGPR for src1 on all.
      if (!ST->hasDPPSrc1SGPR()) {
        assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
                   getOperandSize(*DPPInst, NumOperands, *MRI) &&
               "Src0 and Src1 operands should have the same size");
        OpNum = Src0Idx;
      }
      if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
    if (Mod2) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod2->getImm());
      ++NumOperands;
    }
    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    if (Src2) {
      if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
          !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
      ++NumOperands;
    }

    if (HasVOP3DPP) {
      auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
      if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) {
        DPPInst.addImm(ClampOpr->getImm());
      }
      auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
      if (VdstInOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::vdst_in)) {
        DPPInst.add(*VdstInOpr);
      }
      auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
      if (OmodOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::omod)) {
        DPPInst.addImm(OmodOpr->getImm());
      }
      // Validate that OP_SEL is all zeros and OP_SEL_HI is all ones.
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
        int64_t OpSel = 0;
        OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0);
        OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0);
        OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0);
        if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI))
          OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3;

        if (OpSel != 0) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel must be zero\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel))
          DPPInst.addImm(OpSel);
      }
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
        int64_t OpSelHi = 0;
        OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0);
        OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0);
        OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0);

        // Only VOP3P has op_sel_hi, and all VOP3P instructions have 3
        // operands, so check the bitmask for all 3 op_sel_hi bits set.
        assert(Src2 && "Expected vop3p with 3 operands");
        if (OpSelHi != 7) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel_hi must be all set to one\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel_hi))
          DPPInst.addImm(OpSelHi);
      }
      auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
      if (NegOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_lo)) {
        DPPInst.addImm(NegOpr->getImm());
      }
      auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
      if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
        DPPInst.addImm(NegHiOpr->getImm());
      }
      auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
      if (ByteSelOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
        DPPInst.addImm(ByteSelOpr->getImm());
      }
    }
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined:  " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}

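// Returns true if the immediate is the identity value for OrigMIOp, i.e. a
// value X such that "op X, Y == Y" for any Y: 0 for add/or/xor, all-ones for
// and/umin, 1 for the 24-bit muls, etc. With bound_ctrl off, a lane whose DPP
// source is invalid computes "op $old, src1", which equals src1 when $old is
// the identity; the combined instruction reproduces that by using src1 as the
// combined old value (see the combining rules in the file header).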
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  assert(OldOpnd->isImm());
  switch (OrigMIOp) {
  default: break;
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_U32_e64:
  case AMDGPU::V_SUBREV_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e64:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_MAX_U32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::V_XOR_B32_e64:
    if (OldOpnd->getImm() == 0)
      return true;
    break;
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_MIN_U32_e32:
  case AMDGPU::V_MIN_U32_e64:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
      return true;
    break;
  case AMDGPU::V_MIN_I32_e32:
  case AMDGPU::V_MIN_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
      return true;
    break;
  case AMDGPU::V_MAX_I32_e32:
  case AMDGPU::V_MAX_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
      return true;
    break;
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_I32_I24_e64:
  case AMDGPU::V_MUL_U32_U24_e32:
  case AMDGPU::V_MUL_U32_U24_e64:
    if (OldOpnd->getImm() == 1)
      return true;
    break;
  }
  return false;
}

MachineInstr *GCNDPPCombine::createDPPInst(
    MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
    MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const {
  assert(CombOldVGPR.Reg);
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
      return nullptr;
    }
    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
      LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
      return nullptr;
    }
    CombOldVGPR = getRegSubRegPair(*Src1);
    auto *MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
    const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
    if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
      LLVM_DEBUG(dbgs() << "  failed: src1 has wrong register class\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
}

// Returns true if MI does not have an OpndName immediate operand, or if that
// operand's value, masked by Mask, equals Value.
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
                                    int64_t Value, int64_t Mask) const {
  auto *Imm = TII->getNamedOperand(MI, OpndName);
  if (!Imm)
    return true;

  assert(Imm->isImm());
  return (Imm->getImm() & Mask) == Value;
}

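// Tries to combine MovMI with all of its VALU uses. The uses are gathered
// (looking through REG_SEQUENCE) and each one is rewritten into a DPP
// instruction. Either every use combines and the originals are erased, or a
// single failure rolls the whole transformation back by erasing the newly
// created DPP instructions. Returns true on success.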
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (DPPMovReg.isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move writes physreg\n");
    return false;
  }
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
      MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
    auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
    assert(DppCtrl && DppCtrl->isImm());
    if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
      LLVM_DEBUG(dbgs() << "  failed: 64 bit dpp move uses unsupported"
                           " control value\n");
      // Let the mov be split; the control value may become legal afterwards.
      return false;
    }
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
                            BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  assert(OldOpnd && OldOpnd->isReg());
  assert(SrcOpnd && SrcOpnd->isReg());
  if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move reads physreg\n");
    return false;
  }

  auto *const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF), an immediate, or something
  // else. We could use assert(!OldOpndValue || OldOpndValue->isImm()), but the
  // third option is kept to distinguish undef from non-immediate so the
  // IMPLICIT_DEF instruction can be reused later.
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

  bool CombBCZ = false;

  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() <<
        "  failed: old!=0 with bound_ctrl:0 and not all lanes enabled"
        " isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << *OldOpndValue;
    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');

  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
  DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // Try to reuse the previous old reg if it's undefined (IMPLICIT_DEF).
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
    CombOldVGPR = RegSubRegPair(MRI->createVirtualRegister(RC));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  SmallVector<MachineOperand*, 16> Uses;

  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
    Uses.push_back(&Use);
  }

  while (!Uses.empty()) {
    MachineOperand *Use = Uses.pop_back_val();
    Rollback = true;

    auto &OrigMI = *Use->getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    assert((TII->get(OrigOp).getSize() != 4 || !AMDGPU::isTrue16Inst(OrigOp)) &&
           "There should not be e32 True16 instructions pre-RA");
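    // The dpp mov's result may be forwarded through a REG_SEQUENCE, e.g. when
    // a 64-bit mov was split into 32-bit halves (illustrative):
    //   %seq = REG_SEQUENCE %dpp_mov, %subreg.sub0, %other, %subreg.sub1
    // Treat uses of the sequence that read the matching subregister as uses
    // of the dpp mov itself, and remember the REG_SEQUENCE so its operand can
    // be marked undef once the combine succeeds.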
    if (OrigOp == AMDGPU::REG_SEQUENCE) {
      Register FwdReg = OrigMI.getOperand(0).getReg();
      unsigned FwdSubReg = 0;

      if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
        LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                             " for all uses\n");
        break;
      }

      unsigned OpNo, E = OrigMI.getNumOperands();
      for (OpNo = 1; OpNo < E; OpNo += 2) {
        if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
          FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
          break;
        }
      }

      if (!FwdSubReg)
        break;

      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
        if (Op.getSubReg() == FwdSubReg)
          Uses.push_back(&Op);
      }
      RegSeqWithOpNos[&OrigMI].push_back(OpNo);
      continue;
    }

    bool IsShrinkable = isShrinkable(OrigMI);
    if (!(IsShrinkable ||
          ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
            TII->isVOP3(OrigOp)) &&
           ST->hasVOP3DPP()) ||
          TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3/3P/C\n");
      break;
    }
    if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
      LLVM_DEBUG(dbgs() << "  failed: can't combine v_cmpx\n");
      break;
    }

    auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
      break;
    }

    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    assert(Src0 && "Src1 without Src0?");
    if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
                         (Src2 && Src2->isIdenticalTo(*Src0)))) ||
        (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
                         (Src2 && Src2->isIdenticalTo(*Src1))))) {
      LLVM_DEBUG(
          dbgs()
          << "  " << OrigMI
          << "  failed: DPP register is used more than once per instruction\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (Use == Src0) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ, IsShrinkable)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else {
      assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
        if (auto *DPPInst =
                createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
                              IsShrinkable)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    }
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  Rollback |= !Uses.empty();

  for (auto *MI : *(Rollback ? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  if (!Rollback) {
    for (auto &S : RegSeqWithOpNos) {
      if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
        S.first->eraseFromParent();
        continue;
      }
      while (!S.second.empty())
        S.first->getOperand(S.second.pop_back_val()).setIsUndef();
    }
  }

  return !Rollback;
}

bool GCNDPPCombineLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return GCNDPPCombine().run(MF);
}

bool GCNDPPCombine::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasDPP())
    return false;

  MRI = &MF.getRegInfo();
  TII = ST->getInstrInfo();

  bool Changed = false;
  for (auto &MBB : MF) {
    for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
                 MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
        if (ST->hasDPALU_DPP() && combineDPPMov(MI)) {
          Changed = true;
          ++NumDPPMovsCombined;
        } else {
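          // The 64-bit mov could not be combined as a whole (or this
          // subtarget has no DPALU DPP): split it into two 32-bit halves and
          // try to combine each half on its own.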
          auto Split = TII->expandMovDPP64(MI);
          for (auto *M : {Split.first, Split.second}) {
            if (M && combineDPPMov(*M))
              ++NumDPPMovsCombined;
          }
          Changed = true;
        }
      }
    }
  }
  return Changed;
}

PreservedAnalyses GCNDPPCombinePass::run(MachineFunction &MF,
                                         MachineFunctionAnalysisManager &) {
  MFPropsModifier _(*this, MF);

  if (MF.getFunction().hasOptNone())
    return PreservedAnalyses::all();

  bool Changed = GCNDPPCombine().run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}