//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
#ifdef EXPENSIVE_CHECKS
    cl::init(true));
#else
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI = nullptr;
  MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
  MachineTraceMetrics *Traces = nullptr;
  MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
  MachineBlockFrequencyInfo *MBFI = nullptr;
  ProfileSummaryInfo *PSI = nullptr;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize = false;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  bool isTransientMI(const MachineInstr *MI);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace,
                    const MachineBasicBlock &MBB);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<MachineCombinerPattern, 16> &Patterns);
};
} // end anonymous namespace

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

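/// Return the machine instruction that defines the virtual register used by
/// \p MO, or nullptr if the operand is not a virtual register, has no unique
/// definition, or is defined by a PHI (PHIs carry no depth information).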
MachineInstr *
MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth, etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Return true if MI is unlikely to generate an actual target instruction.
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
  if (!MI->isCopy())
    return MI->isTransient();

  // If MI is a COPY, check if its src and dst registers can be coalesced.
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();

  if (!MI->isFullCopy()) {
    // If src RC contains super registers of dst RC, it can also be coalesced.
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
      return false;

    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
  }

  if (Src.isPhysical() && Dst.isPhysical())
    return Src == Dst;

  if (Src.isVirtual() && Dst.isVirtual()) {
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
  }

  if (Src.isVirtual())
    std::swap(Src, Dst);

  // Now Src is a physical register and Dst is a virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace,
                          const MachineBasicBlock &MBB) {
  SmallVector<unsigned, 16> InstrDepth;
  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
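  // For example (illustrative numbers): an instruction that uses %a (depth 3,
  // def-to-use latency 2) and %b (depth 1, latency 4) gets depth
  // max(3 + 2, 1 + 4) = 5.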
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && MO.getReg().isVirtual()))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                             MachineTraceStrategy::TS_Local ||
                         DefInstr->getParent() == &MBB)) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
                InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && MO.getReg().isVirtual()))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
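    // Note that the register use list is not kept in program order, so this
    // inspects an arbitrary user; the resulting latency is an approximation.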
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth,            // The data dependency chain must be improved.
  MustReduceRegisterPressure, // The register pressure must be reduced.
  Default                     // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
  case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
  case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
  case MachineCombinerPattern::SUBADD_OP1:
  case MachineCombinerPattern::SUBADD_OP2:
  case MachineCombinerPattern::FMADD_AX:
  case MachineCombinerPattern::FMADD_XA:
  case MachineCombinerPattern::FMSUB:
  case MachineCombinerPattern::FNMSUB:
    return CombinerObjective::MustReduceDepth;
  case MachineCombinerPattern::REASSOC_XY_BCA:
  case MachineCombinerPattern::REASSOC_XY_BAC:
    return CombinerObjective::MustReduceRegisterPressure;
  default:
    return CombinerObjective::Default;
  }
}

/// Estimate the latency of the new and original instruction sequence by summing
/// up the latencies of the inserted and deleted instructions. This assumes
/// that the inserted and deleted instructions are dependent instruction chains,
/// which might not hold in all cases.
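///
/// For example (illustrative): with InsInstrs = {I0, NewRoot} and I0 taking 2
/// cycles, NewRootLatency is 2 plus NewRoot's latency, while RootLatency is
/// the sum of the latencies of all instructions in DelInstrs.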
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}

bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineCombinerPattern Pattern) {
  // FIXME: For now, we don't check anything for the register pressure
  // patterns and treat them as always profitable. We could do better if we
  // made the RegPressureTracker class aware of the TIE attribute; then we
  // could get an accurate comparison of the register pressure with DelInstrs
  // or InsInstrs.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern,
    bool SlackIsAccurate) {
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth =
      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << "  Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t  and it does it\n"
                   : dbgs() << "\t  but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions.
  unsigned NewRootLatency, RootLatency;
  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
    std::tie(NewRootLatency, RootLatency) =
        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
  } else {
    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
    RootLatency = TSchedModel.computeInstrLatency(Root);
  }

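  // The transform is accepted when
  //   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack,
  // where the slack term is counted only when it is known to be accurate.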
  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t  It IMPROVES PathLen because"
                 : dbgs() << "\n\t  It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
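/// A target may allow a bounded increase via getExtendResourceLenLimit().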
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute the current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <=
      ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t  As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t  As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param TraceEnsemble is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///                          otherwise invalidate the trace
static void insertDeleteInstructions(
    MachineBasicBlock *MBB, MachineInstr &MI,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Ensemble *TraceEnsemble,
    SparseSet<LiveRegUnit> &RegUnits, const TargetInstrInfo *TII,
    MachineCombinerPattern Pattern, bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because in genAlternativeCodeSequence, it has not yet been
  // decided whether InsInstrs is the better pattern, so we don't want to
  // generate any side effects in the function yet. For example, constant pool
  // entry creation is delayed until here so that an entry created for
  // InsInstrs does not linger if InsInstrs turns out not to be the better
  // pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    TraceEnsemble->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between original and new latency is decreasing for
// later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(
    MachineBasicBlock *MBB, MachineInstr &Root,
    SmallVector<MachineCombinerPattern, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
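  // LastUpdate marks how far incremental depth updates have progressed;
  // instructions in [LastUpdate, BlockIter) still need their depths refreshed
  // before the trace can be queried again.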
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!TraceEnsemble)
    TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
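    // E.g. (illustrative latencies): with a 1-cycle ADD and a 4-cycle MADD,
    // an ADD whose MUL operand is off the critical path contributes 1 cycle
    // to that path before the substitution and 4 cycles after it.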
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (const auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (InsInstrs.empty())
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
      });

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above the
          // threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed = true;

          // Go back to the previous instruction, as it may have an ILP
          // reassociation opportunity.
          BlockIter--;
          break;
        }
      }

      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
                          << InsInstrs.size() << " < "
                          << DelInstrs.size() << ")\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above the
            // threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Clean up the instructions of the alternative code sequence. There is
        // no use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary())
             ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
             : nullptr;
  TraceEnsemble = nullptr;
  OptSize = MF.getFunction().hasOptSize();
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}