//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
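/// \brief Fold the source operand of mov and copy instructions (immediates,
/// frame indices, and virtual registers) directly into their uses when the
/// using instruction can accept it.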
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Fold Operands";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

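// Returns true if \p Opcode is a mov or copy whose single source operand is a
// candidate for folding into its users.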
static bool isSafeToFold(unsigned Opcode) {
  switch(Opcode) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
    return true;
  default:
    return false;
  }
}

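// Rewrite the use operand recorded in \p Fold in place with the folded
// immediate, frame index, or virtual register. Returns true if the operand was
// actually changed.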
static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

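// Returns true if \p MI is already the user in one of the pending fold
// candidates.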
static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

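// Record folding \p OpToFold into operand \p OpNo of \p MI if the fold is (or
// can be made) legal, converting v_mac_f32 to v_mad_f32 or commuting operands
// when that is what makes the fold possible.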
static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_f32_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if (Opc == AMDGPU::V_MAC_F32_e64 &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      // Check if changing this to a v_mad_f32 instruction will allow us to
      // fold the operand.
      MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an immediate, and OpNo may refer to it
    // after the call to commuteInstruction() below. Avoid that situation here
    // explicitly, as OpNo must be a register operand to be a candidate for
    // folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
      return false;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

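// Try to fold \p OpToFold into the use at operand index \p UseOpIdx of
// \p UseMI. Successful candidates are appended to \p FoldList; copies that get
// rewritten into mov instructions are collected in \p CopiesToReplace so their
// implicit operands can be added after use-list iteration is finished.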
static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
                        unsigned UseOpIdx,
                        std::vector<FoldCandidate> &FoldList,
                        SmallVectorImpl<MachineInstr *> &CopiesToReplace,
                        const SIInstrInfo *TII, const SIRegisterInfo &TRI,
                        MachineRegisterInfo &MRI) {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands; only fold a full
    // copy, since a subregister use tied to a full register def doesn't really
    // make sense. E.g. don't fold:
    //
    // %vreg1 = COPY %vreg0:sub1
    // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg1<tied0>
    //
    // into
    // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  bool FoldingImm = OpToFold.isImm();
  APInt Imm;

  if (FoldingImm) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI.getRegClass(UseReg) :
      TRI.getPhysRegClass(UseReg);

    Imm = APInt(64, OpToFold.getImm());

    const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
    const TargetRegisterClass *FoldRC =
        TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);

    // Split 64-bit constants into 32-bits for folding.
    if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
      if (UseRC->getSize() != 8)
        return;

      if (UseOp.getSubReg() == AMDGPU::sub0) {
        Imm = Imm.getLoBits(32);
      } else {
        assert(UseOp.getSubReg() == AMDGPU::sub1);
        Imm = Imm.getHiBits(32);
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.
    if (UseMI->getOpcode() == AMDGPU::COPY) {
      unsigned DestReg = UseMI->getOperand(0).getReg();
      const TargetRegisterClass *DestRC
        = TargetRegisterInfo::isVirtualRegister(DestReg) ?
        MRI.getRegClass(DestReg) :
        TRI.getPhysRegClass(DestReg);

      unsigned MovOp = TII->getMovOpcode(DestRC);
      if (MovOp == AMDGPU::COPY)
        return;

      UseMI->setDesc(TII->get(MovOp));
      CopiesToReplace.push_back(UseMI);
    }
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
         RSUse = MRI.use_begin(RegSeqDstReg),
         RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace, TII, TRI, MRI);
    }
    return;
  }

  const MCInstrDesc &UseDesc = UseMI->getDesc();

  // Don't fold into target independent nodes.  Target independent opcodes
  // don't have defined register classes.
  if (UseDesc.isVariadic() ||
      UseDesc.OpInfo[UseOpIdx].RegClass == -1)
    return;

  if (FoldingImm) {
    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

  // FIXME: We could try to change the instruction from 64-bit to 32-bit
  // to enable more folding opportunities.  The shrink operands pass
  // already does this.
  return;
}

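// Evaluate a 32-bit AND/OR/XOR with two known constant operands. Returns false
// for any other opcode.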
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  int32_t LHS, int32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  default:
    return false;
  }
}

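// Return the 32-bit move opcode for the result: S_MOV_B32 for scalar (SGPR)
// destinations, V_MOV_B32_e32 otherwise.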
static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MachineOperand &Src0 = MI->getOperand(1);
    if (Src0.isImm()) {
      Src0.setImm(~Src0.getImm());
      MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
      return true;
    }

    return false;
  }

  if (!MI->isCommutable())
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    Src0->setImm(NewImm);
    MI->RemoveOperand(Src1Idx);
    MI->setDesc(TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      MI->setDesc(TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      MI->setDesc(TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      MI->setDesc(TII->get(AMDGPU::COPY));
    }
  }

  return false;
}

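// Walk every block, folding the source operand of each safe mov/copy into the
// uses of its destination register, then simplifying any users whose operands
// become constant as a result.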
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI.getOpcode()))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases.  A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example, the
      // source of this COPY must not be replaced with the 1 that is only
      // defined later:
      //
      //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      //    ...
      //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      // We need to mutate the operands of new mov instructions to add implicit
      // uses of EXEC, but adding them invalidates the use_iterator, so defer
      // this.
      SmallVector<MachineInstr *, 4> CopiesToReplace;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();

        foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
                    CopiesToReplace, TII, TRI, MRI);
      }

      // Make sure we add EXEC uses to any new v_mov instructions created.
      for (MachineInstr *Copy : CopiesToReplace)
        Copy->addImplicitDefUseOperands(MF);

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (Fold.isReg()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            // FIXME: Probably shouldn't bother trying to fold if not an
            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
            // copies.
            MRI.clearKillFlags(Fold.OpToFold->getReg());
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');

          // Folding the immediate may reveal operations that can be constant
          // folded or replaced with a copy. This can happen for example after
          // frame indices are lowered to constants or from splitting 64-bit
          // constants.
          tryConstantFoldOp(MRI, TII, Fold.UseMI);
        }
      }
    }
  }
  return false;
}