//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const SISubtarget *ST;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   unsigned UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
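// For example (an illustrative sketch, not from a real test): folding the
// inline constant 2.0 into src2 of a v_mac requires first rewriting the mac
// into a mad, so legality has to be checked against the mad descriptor:
//
//   %vreg0 = V_MOV_B32_e32 2.0
//   %vreg3 = V_MAC_F32_e64 %vreg1, %vreg2, %vreg0
//     -->
//   %vreg3 = V_MAD_F32 %vreg1, %vreg2, 2.0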
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_F16_e64: {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
      const MCInstrDesc &MadDesc
        = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
  }
  default:
    return false;
  }
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

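// Apply a previously recorded fold: rewrite the use operand in place to the
// immediate, frame index, or virtual register it should now refer to. Returns
// false if the fold cannot be applied (currently the physical register case).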
static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

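// Try to record folding OpToFold into operand OpNo of MI. If the operand is
// not immediately legal, this also tries rewriting v_mac to v_mad (for src2),
// s_setreg_b32 to s_setreg_imm32_b32, or commuting MI to make the fold
// possible. Returns true if a candidate was appended to FoldList.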
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
      return true;
    }

    // If we are already folding into another operand of MI, then we can't
    // commute the instruction; otherwise we risk making the other fold
    // illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call of commuteInstruction() below. Such situations are
    // avoided here explicitly, as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
      return false;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef() && !TII->isSDWA(MI);
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  unsigned UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands; only fold a full
    // copy, since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %vreg1 = COPY %vreg0:sub1
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
    //
    //  into
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
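  // A hypothetical illustration (not from a real test): the constant feeding
  // one lane of the REG_SEQUENCE is folded into the user of that subregister
  // instead:
  //
  //   %vreg0 = V_MOV_B32_e32 63
  //   %vreg2 = REG_SEQUENCE %vreg0, sub0, %vreg1, sub1
  //   %vreg3 = V_ADD_I32_e32 %vreg2:sub0, %vreg4
  //     -->
  //   %vreg3 = V_ADD_I32_e32 63, %vreg4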
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  bool FoldingImm = OpToFold.isImm();

  // In order to fold immediates into copies, we need to change the
  // copy to a MOV.
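  // e.g. (sketch): if %vreg0 is defined by "S_MOV_B32 42" and %vreg1 is a
  // VGPR, then
  //
  //   %vreg1 = COPY %vreg0   -->   %vreg1 = V_MOV_B32_e32 42
  //
  // and the immediate is then folded like any other mov source below.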
  if (FoldingImm && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI->getRegClass(DestReg) :
      TRI->getPhysRegClass(DestReg);

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes.  Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImm) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
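  // e.g. (sketch): a use of only the high half of a 64-bit materialized
  // constant folds just those 32 bits:
  //
  //   %vreg0 = S_MOV_B64 0x1234567800000000
  //   %vreg2 = V_ADD_I32_e32 %vreg0:sub1, %vreg1
  //     -->
  //   %vreg2 = V_ADD_I32_e32 0x12345678, %vreg1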
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI->getRegClass(UseReg) :
      TRI->getPhysRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

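// If Op is a register whose whole value is produced by a move-immediate,
// return that def's immediate operand so callers can treat it as a constant;
// otherwise return Op itself.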
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister)
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI,
                              MachineOperand *ImmOp) {
  unsigned Opc = MI->getOpcode();
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one
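// e.g. (sketch): a select whose two value operands are identical reduces to a
// copy of that value, regardless of the condition:
//
//   %vreg2 = V_CNDMASK_B32_e64 %vreg0, %vreg0, %vreg1  -->  %vreg2 = COPY %vreg0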
static bool tryFoldInst(const SIInstrInfo *TII,
                        MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
      Opc == AMDGPU::V_CNDMASK_B32_e64    ||
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
    if (Src1->isIdenticalTo(*Src0)) {
      DEBUG(dbgs() << "Folded " << *MI << " into ");
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (Src2Idx != -1)
        MI->RemoveOperand(Src2Idx);
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
                                               : getMovOpc(false)));
      DEBUG(dbgs() << *MI << '\n');
      return true;
    }
  }

  return false;
}

void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    MachineRegisterInfo::use_iterator NextUse, NextInstUse;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; Use = NextUse) {
      NextUse = std::next(Use);
      MachineInstr *UseMI = Use->getParent();
      unsigned OpNo = Use.getOperandNo();

      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.

      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
        DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');

        // Some constant folding cases change the same immediate's use to a new
        // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
        // again. The same constant folded instruction could also have a second
        // use operand.
        NextUse = MRI->use_begin(Dst.getReg());
        continue;
      }

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
    }
  } else {
    // Folding register.
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; ++Use) {
      MachineInstr *UseMI = Use->getParent();

      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
                  FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    if (updateOperand(Fold, *TRI)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
            static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
      tryFoldInst(TII, Fold.UseMI);
    }
  }
}

const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// A clamp's source register always has more than one use, since it appears
// twice in the max instruction itself, so count the using instructions rather
// than the individual uses.
static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
  int Count = 0;
  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
       I != E; ++I) {
    if (++Count > 1)
      return false;
  }

  return true;
}

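// Fold a clamping max back into the instruction that defines its source, e.g.
// (sketch):
//
//   %vreg1 = V_ADD_F32_e64 %vreg2, %vreg3
//   %vreg4 = V_MAX_F32_e64 %vreg1, %vreg1 clamp
//     -->
//   %vreg1 = V_ADD_F32_e64 %vreg2, %vreg3 clamp   (uses of %vreg4 now use %vreg1)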
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
  if (!TII->hasFPClamp(*Def))
    return false;
  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::make_pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::make_pair(nullptr, SIOutMods::NONE);

    return std::make_pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::make_pair(Src0, SIOutMods::MUL2);

    return std::make_pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::make_pair(nullptr, SIOutMods::NONE);
  }
}

// FIXME: Does this need to check IEEE bit on function?
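// Fold a multiply by 2.0, 4.0, or 0.5 (or the equivalent fadd x, x) into the
// output-modifier field of the instruction defining its source, e.g. (sketch):
//
//   %vreg1 = V_ADD_F32_e64 %vreg2, %vreg3
//   %vreg4 = V_MUL_F32_e64 %vreg1, 2.0
//     -->
//   %vreg1 = V_ADD_F32_e64 %vreg2, %vreg3 mul:2   (uses of %vreg4 now use %vreg1)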
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // TODO: Check nsz on instructions when fast math flags are preserved to MI
  // level.
  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      tryFoldInst(TII, &MI);

      if (!TII->isFoldableCopy(MI)) {
        if (IsIEEEMode || !tryFoldOMod(MI))
          tryFoldClamp(MI);
        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY below must not have its source replaced by the 1 that is
      // only written to %VGPR0 later:
      //
      //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      //    ...
      //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      foldInstOperand(MI, OpToFold);
    }
  }
  return false;
}