//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
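  // Walk back over at most Threshold instructions to find the S_AND/S_ANDN2
  // that defines vcc; give up if exec is modified first, or if vcc is
  // clobbered by anything other than the expected AND.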
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

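  // Determine the mask value being ANDed with exec: either an inline
  // immediate, or a register whose reaching definition is an S_MOV of -1 or 0.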
  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
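    // Note: A is a reverse iterator, so std::next(A) walks further up the
    // block; the loop below searches earlier instructions for the def of SReg.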
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to also catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First, if SReg is only used in the AND instruction, fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

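  // If SCC is dead here and vcc is not read between the AND and the branch,
  // the AND itself is no longer needed. If the branch does not kill vcc,
  // first materialize the same value with an S_MOV (0, or exec itself) so
  // later readers of vcc still see the right value.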
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

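// The second of two identical S_SET_GPR_IDX_ON instructions is redundant if
// nothing between them disturbs M0, the index register, or an indexed vector
// register (indirect vector moves using the same index mode are allowed).
// Returns true if MI and any intervening S_SET_GPR_IDX_OFF were erased.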
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to make
  // sure nothing in between invalidates reusing the first one.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

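// Find the true and false destinations of the branch terminating SrcMBB. If
// analyzeBranch reports no explicit false destination, the fallthrough block
// is used. Returns false if the terminators could not be analyzed.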
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

namespace {
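// Models whether unconditionally executing a then-block is cheaper than
// keeping the s_cbranch_execz that skips it, using instruction latencies from
// the scheduling model and the branch probability recorded on the head block.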
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability of execz being false, i.e. of
    // falling through and executing the then-block. The transformation is
    // profitable if always executing the 'then' block is cheaper than
    // sometimes executing 'then' and always executing s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost +
    //   N/D * BranchNotTakenCost
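    // Worked example: with P = 1/2, BranchTakenCost = 4 and
    // BranchNotTakenCost = 1, the inequality (1/2)*ThenCost <=
    // (1/2)*4 + (1/2)*1 holds while ThenCyclesCost <= 5 cycles.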
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

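  // Check every block that would start executing unconditionally once the
  // branch is removed: From up to, but not including, To.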
  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain cbranch out of the loop lest it become infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
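  // The cost model below relies on per-instruction latencies, which are only
  // meaningful when the subtarget provides an instruction scheduling model.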
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Consider only when it is legal and profitable
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

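  // Renumber blocks so that removeExeczBranch can recognize forward branches
  // by comparing block numbers.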
  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if the
    // second one is redundant. The expensive checks happen in
    // optimizeSetGPR(); limit the scan distance to 20 instructions for
    // compile-time purposes.
    // Note: this needs to work on bundles, as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}