xref: /llvm-project/bolt/lib/Passes/TailDuplication.cpp (revision fd38366e4525c5507bbb2a2fc1f7d113a964224e)
1 //===- bolt/Passes/TailDuplication.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the TailDuplication class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/TailDuplication.h"
14 #include "llvm/ADT/DenseMap.h"
15 #include "llvm/MC/MCRegisterInfo.h"
16 
17 #include <numeric>
18 #include <queue>
19 
20 #define DEBUG_TYPE "taildup"
21 
22 using namespace llvm;
23 
24 namespace opts {
25 
26 extern cl::OptionCategory BoltOptCategory;
27 extern cl::opt<bool> NoThreads;
28 
29 cl::opt<bolt::TailDuplication::DuplicationMode> TailDuplicationMode(
30     "tail-duplication",
31     cl::desc("duplicate unconditional branches that cross a cache line"),
32     cl::init(bolt::TailDuplication::TD_NONE),
33     cl::values(clEnumValN(bolt::TailDuplication::TD_NONE, "none",
34                           "do not apply"),
35                clEnumValN(bolt::TailDuplication::TD_AGGRESSIVE, "aggressive",
36                           "aggressive strategy"),
37                clEnumValN(bolt::TailDuplication::TD_MODERATE, "moderate",
38                           "moderate strategy"),
39                clEnumValN(bolt::TailDuplication::TD_CACHE, "cache",
40                           "cache-aware duplication strategy")),
41     cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
42 
43 static cl::opt<unsigned>
44     TailDuplicationMinimumOffset("tail-duplication-minimum-offset",
45                                  cl::desc("minimum offset needed between block "
46                                           "and successor to allow duplication"),
47                                  cl::ReallyHidden, cl::init(64),
48                                  cl::cat(BoltOptCategory));
49 
50 static cl::opt<unsigned> TailDuplicationMaximumDuplication(
51     "tail-duplication-maximum-duplication",
52     cl::desc("tail blocks whose size (in bytes) exceeds the value are never "
53              "duplicated"),
54     cl::ZeroOrMore, cl::ReallyHidden, cl::init(24), cl::cat(BoltOptCategory));
55 
56 static cl::opt<unsigned> TailDuplicationMinimumDuplication(
57     "tail-duplication-minimum-duplication",
58     cl::desc("tail blocks with size (in bytes) not exceeding the value are "
59              "always duplicated"),
60     cl::ReallyHidden, cl::init(2), cl::cat(BoltOptCategory));
61 
62 static cl::opt<bool> TailDuplicationConstCopyPropagation(
63     "tail-duplication-const-copy-propagation",
64     cl::desc("enable const and copy propagation after tail duplication"),
65     cl::ReallyHidden, cl::init(false), cl::cat(BoltOptCategory));
66 
67 static cl::opt<unsigned> TailDuplicationMaxCacheDistance(
68     "tail-duplication-max-cache-distance",
69     cl::desc("The weight of backward jumps for ExtTSP value"), cl::init(256),
70     cl::ReallyHidden, cl::cat(BoltOptCategory));
71 
72 static cl::opt<double> TailDuplicationCacheBackwardWeight(
73     "tail-duplication-cache-backward-weight",
74     cl::desc(
75         "The maximum distance (in bytes) of backward jumps for ExtTSP value"),
76     cl::init(0.5), cl::ReallyHidden, cl::cat(BoltOptCategory));
77 
78 } // namespace opts
79 
80 namespace llvm {
81 namespace bolt {
82 
getCallerSavedRegs(const MCInst & Inst,BitVector & Regs,BinaryContext & BC) const83 void TailDuplication::getCallerSavedRegs(const MCInst &Inst, BitVector &Regs,
84                                          BinaryContext &BC) const {
85   if (!BC.MIB->isCall(Inst))
86     return;
87   BitVector CallRegs = BitVector(BC.MRI->getNumRegs(), false);
88   BC.MIB->getCalleeSavedRegs(CallRegs);
89   CallRegs.flip();
90   Regs |= CallRegs;
91 }
92 
regIsPossiblyOverwritten(const MCInst & Inst,unsigned Reg,BinaryContext & BC) const93 bool TailDuplication::regIsPossiblyOverwritten(const MCInst &Inst, unsigned Reg,
94                                                BinaryContext &BC) const {
95   BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false);
96   BC.MIB->getWrittenRegs(Inst, WrittenRegs);
97   getCallerSavedRegs(Inst, WrittenRegs, BC);
98   if (BC.MIB->isRep(Inst))
99     BC.MIB->getRepRegs(WrittenRegs);
100   WrittenRegs &= BC.MIB->getAliases(Reg, false);
101   return WrittenRegs.any();
102 }
103 
regIsDefinitelyOverwritten(const MCInst & Inst,unsigned Reg,BinaryContext & BC) const104 bool TailDuplication::regIsDefinitelyOverwritten(const MCInst &Inst,
105                                                  unsigned Reg,
106                                                  BinaryContext &BC) const {
107   BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false);
108   BC.MIB->getWrittenRegs(Inst, WrittenRegs);
109   getCallerSavedRegs(Inst, WrittenRegs, BC);
110   if (BC.MIB->isRep(Inst))
111     BC.MIB->getRepRegs(WrittenRegs);
112   return (!regIsUsed(Inst, Reg, BC) && WrittenRegs.test(Reg) &&
113           !BC.MIB->isConditionalMove(Inst));
114 }
115 
regIsUsed(const MCInst & Inst,unsigned Reg,BinaryContext & BC) const116 bool TailDuplication::regIsUsed(const MCInst &Inst, unsigned Reg,
117                                 BinaryContext &BC) const {
118   BitVector SrcRegs = BitVector(BC.MRI->getNumRegs(), false);
119   BC.MIB->getSrcRegs(Inst, SrcRegs);
120   SrcRegs &= BC.MIB->getAliases(Reg, true);
121   return SrcRegs.any();
122 }
123 
isOverwrittenBeforeUsed(BinaryBasicBlock & StartBB,unsigned Reg) const124 bool TailDuplication::isOverwrittenBeforeUsed(BinaryBasicBlock &StartBB,
125                                               unsigned Reg) const {
126   BinaryFunction *BF = StartBB.getFunction();
127   BinaryContext &BC = BF->getBinaryContext();
128   std::queue<BinaryBasicBlock *> Q;
129   for (auto Itr = StartBB.succ_begin(); Itr != StartBB.succ_end(); ++Itr) {
130     BinaryBasicBlock *NextBB = *Itr;
131     Q.push(NextBB);
132   }
133   std::set<BinaryBasicBlock *> Visited;
134   // Breadth first search through successive blocks and see if Reg is ever used
135   // before its overwritten
136   while (Q.size() > 0) {
137     BinaryBasicBlock *CurrBB = Q.front();
138     Q.pop();
139     if (Visited.count(CurrBB))
140       continue;
141     Visited.insert(CurrBB);
142     bool Overwritten = false;
143     for (auto Itr = CurrBB->begin(); Itr != CurrBB->end(); ++Itr) {
144       MCInst &Inst = *Itr;
145       if (regIsUsed(Inst, Reg, BC))
146         return false;
147       if (regIsDefinitelyOverwritten(Inst, Reg, BC)) {
148         Overwritten = true;
149         break;
150       }
151     }
152     if (Overwritten)
153       continue;
154     for (auto Itr = CurrBB->succ_begin(); Itr != CurrBB->succ_end(); ++Itr) {
155       BinaryBasicBlock *NextBB = *Itr;
156       Q.push(NextBB);
157     }
158   }
159   return true;
160 }
161 
constantAndCopyPropagate(BinaryBasicBlock & OriginalBB,std::vector<BinaryBasicBlock * > & BlocksToPropagate)162 void TailDuplication::constantAndCopyPropagate(
163     BinaryBasicBlock &OriginalBB,
164     std::vector<BinaryBasicBlock *> &BlocksToPropagate) {
165   BinaryFunction *BF = OriginalBB.getFunction();
166   BinaryContext &BC = BF->getBinaryContext();
167 
168   BlocksToPropagate.insert(BlocksToPropagate.begin(), &OriginalBB);
169   // Iterate through the original instructions to find one to propagate
170   for (auto Itr = OriginalBB.begin(); Itr != OriginalBB.end(); ++Itr) {
171     MCInst &OriginalInst = *Itr;
172     // It must be a non conditional
173     if (BC.MIB->isConditionalMove(OriginalInst))
174       continue;
175 
176     // Move immediate or move register
177     if ((!BC.MII->get(OriginalInst.getOpcode()).isMoveImmediate() ||
178          !OriginalInst.getOperand(1).isImm()) &&
179         (!BC.MII->get(OriginalInst.getOpcode()).isMoveReg() ||
180          !OriginalInst.getOperand(1).isReg()))
181       continue;
182 
183     // True if this is constant propagation and not copy propagation
184     bool ConstantProp = BC.MII->get(OriginalInst.getOpcode()).isMoveImmediate();
185     // The Register to replaced
186     unsigned Reg = OriginalInst.getOperand(0).getReg();
187     // True if the register to replace was replaced everywhere it was used
188     bool ReplacedEverywhere = true;
189     // True if the register was definitely overwritten
190     bool Overwritten = false;
191     // True if the register to replace and the register to replace with (for
192     // copy propagation) has not been overwritten and is still usable
193     bool RegsActive = true;
194 
195     // Iterate through successor blocks and through their instructions
196     for (BinaryBasicBlock *NextBB : BlocksToPropagate) {
197       for (auto PropagateItr =
198                ((NextBB == &OriginalBB) ? Itr + 1 : NextBB->begin());
199            PropagateItr < NextBB->end(); ++PropagateItr) {
200         MCInst &PropagateInst = *PropagateItr;
201         if (regIsUsed(PropagateInst, Reg, BC)) {
202           bool Replaced = false;
203           // If both registers are active for copy propagation or the register
204           // to replace is active for constant propagation
205           if (RegsActive) {
206             // Set Replaced and so ReplacedEverwhere to false if it cannot be
207             // replaced (no replacing that opcode, Register is src and dest)
208             if (ConstantProp)
209               Replaced = BC.MIB->replaceRegWithImm(
210                   PropagateInst, Reg, OriginalInst.getOperand(1).getImm());
211             else
212               Replaced = BC.MIB->replaceRegWithReg(
213                   PropagateInst, Reg, OriginalInst.getOperand(1).getReg());
214           }
215           ReplacedEverywhere = ReplacedEverywhere && Replaced;
216         }
217         // For copy propagation, make sure no propagation happens after the
218         // register to replace with is overwritten
219         if (!ConstantProp &&
220             regIsPossiblyOverwritten(PropagateInst,
221                                      OriginalInst.getOperand(1).getReg(), BC))
222           RegsActive = false;
223 
224         // Make sure no propagation happens after the register to replace is
225         // overwritten
226         if (regIsPossiblyOverwritten(PropagateInst, Reg, BC))
227           RegsActive = false;
228 
229         // Record if the register to replace is overwritten
230         if (regIsDefinitelyOverwritten(PropagateInst, Reg, BC)) {
231           Overwritten = true;
232           break;
233         }
234       }
235       if (Overwritten)
236         break;
237     }
238 
239     // If the register was replaced everywhere and it was overwritten in either
240     // one of the iterated through blocks or one of the successor blocks, delete
241     // the original move instruction
242     if (ReplacedEverywhere &&
243         (Overwritten ||
244          isOverwrittenBeforeUsed(
245              *BlocksToPropagate[BlocksToPropagate.size() - 1], Reg))) {
246       // If both registers are active for copy propagation or the register
247       // to replace is active for constant propagation
248       StaticInstructionDeletionCount++;
249       DynamicInstructionDeletionCount += OriginalBB.getExecutionCount();
250       Itr = std::prev(OriginalBB.eraseInstruction(Itr));
251     }
252   }
253 }
254 
isInCacheLine(const BinaryBasicBlock & BB,const BinaryBasicBlock & Succ) const255 bool TailDuplication::isInCacheLine(const BinaryBasicBlock &BB,
256                                     const BinaryBasicBlock &Succ) const {
257   if (&BB == &Succ)
258     return true;
259 
260   uint64_t Distance = 0;
261   int Direction = (Succ.getLayoutIndex() > BB.getLayoutIndex()) ? 1 : -1;
262 
263   for (unsigned I = BB.getLayoutIndex() + Direction; I != Succ.getLayoutIndex();
264        I += Direction) {
265     Distance += BB.getFunction()->getLayout().getBlock(I)->getOriginalSize();
266     if (Distance > opts::TailDuplicationMinimumOffset)
267       return false;
268   }
269   return true;
270 }
271 
272 std::vector<BinaryBasicBlock *>
moderateDuplicate(BinaryBasicBlock & BB,BinaryBasicBlock & Tail) const273 TailDuplication::moderateDuplicate(BinaryBasicBlock &BB,
274                                    BinaryBasicBlock &Tail) const {
275   std::vector<BinaryBasicBlock *> BlocksToDuplicate;
276   // The block must be hot
277   if (BB.getKnownExecutionCount() == 0)
278     return BlocksToDuplicate;
279   // and its successor is not already in the same cache line
280   if (isInCacheLine(BB, Tail))
281     return BlocksToDuplicate;
282   // and its size do not exceed the maximum allowed size
283   if (Tail.getOriginalSize() > opts::TailDuplicationMaximumDuplication)
284     return BlocksToDuplicate;
285   // If duplicating would introduce a new branch, don't duplicate
286   for (auto Itr = Tail.succ_begin(); Itr != Tail.succ_end(); ++Itr) {
287     if ((*Itr)->getLayoutIndex() == Tail.getLayoutIndex() + 1)
288       return BlocksToDuplicate;
289   }
290 
291   BlocksToDuplicate.push_back(&Tail);
292   return BlocksToDuplicate;
293 }
294 
295 std::vector<BinaryBasicBlock *>
aggressiveDuplicate(BinaryBasicBlock & BB,BinaryBasicBlock & Tail) const296 TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB,
297                                      BinaryBasicBlock &Tail) const {
298   std::vector<BinaryBasicBlock *> BlocksToDuplicate;
299   // The block must be hot
300   if (BB.getKnownExecutionCount() == 0)
301     return BlocksToDuplicate;
302   // and its successor is not already in the same cache line
303   if (isInCacheLine(BB, Tail))
304     return BlocksToDuplicate;
305 
306   BinaryBasicBlock *CurrBB = &Tail;
307   while (CurrBB) {
308     LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding "
309                       << CurrBB->getName() << " to duplication list\n";);
310     BlocksToDuplicate.push_back(CurrBB);
311 
312     if (CurrBB->hasJumpTable()) {
313       LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing duplication "
314                            "list due to a JT in "
315                         << CurrBB->getName() << '\n';);
316       BlocksToDuplicate.clear();
317       break;
318     }
319 
320     // With no successors, we've reached the end and should duplicate all of
321     // BlocksToDuplicate
322     if (CurrBB->succ_size() == 0)
323       break;
324 
325     // With two successors, if they're both a jump, we should duplicate all
326     // blocks in BlocksToDuplicate. Otherwise, we cannot find a simple stream of
327     // blocks to copy
328     if (CurrBB->succ_size() >= 2) {
329       if (CurrBB->getConditionalSuccessor(false)->getLayoutIndex() ==
330               CurrBB->getLayoutIndex() + 1 ||
331           CurrBB->getConditionalSuccessor(true)->getLayoutIndex() ==
332               CurrBB->getLayoutIndex() + 1) {
333         LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing "
334                              "duplication list, can't find a simple stream at "
335                           << CurrBB->getName() << '\n';);
336         BlocksToDuplicate.clear();
337       }
338       break;
339     }
340 
341     // With one successor, if its a jump, we should duplicate all blocks in
342     // BlocksToDuplicate. Otherwise, we should keep going
343     BinaryBasicBlock *SuccBB = CurrBB->getSuccessor();
344     if (SuccBB->getLayoutIndex() != CurrBB->getLayoutIndex() + 1)
345       break;
346     CurrBB = SuccBB;
347   }
348   // Don't duplicate if its too much code
349   unsigned DuplicationByteCount = std::accumulate(
350       std::begin(BlocksToDuplicate), std::end(BlocksToDuplicate), 0,
351       [](int value, BinaryBasicBlock *p) {
352         return value + p->getOriginalSize();
353       });
354   if (DuplicationByteCount > opts::TailDuplicationMaximumDuplication) {
355     LLVM_DEBUG(dbgs() << "Aggressive tail duplication: duplication byte count ("
356                       << DuplicationByteCount << ") exceeds maximum "
357                       << opts::TailDuplicationMaximumDuplication << '\n';);
358     BlocksToDuplicate.clear();
359   }
360   LLVM_DEBUG(dbgs() << "Aggressive tail duplication: found "
361                     << BlocksToDuplicate.size() << " blocks to duplicate\n";);
362   return BlocksToDuplicate;
363 }
364 
shouldDuplicate(BinaryBasicBlock * Pred,BinaryBasicBlock * Tail) const365 bool TailDuplication::shouldDuplicate(BinaryBasicBlock *Pred,
366                                       BinaryBasicBlock *Tail) const {
367   if (Pred == Tail)
368     return false;
369   // Cannot duplicate non-tail blocks
370   if (Tail->succ_size() != 0)
371     return false;
372   // The blocks are already in the order
373   if (Pred->getLayoutIndex() + 1 == Tail->getLayoutIndex())
374     return false;
375   // No tail duplication for blocks with jump tables
376   if (Pred->hasJumpTable())
377     return false;
378   if (Tail->hasJumpTable())
379     return false;
380 
381   return true;
382 }
383 
cacheScore(uint64_t SrcAddr,uint64_t SrcSize,uint64_t DstAddr,uint64_t DstSize,uint64_t Count) const384 double TailDuplication::cacheScore(uint64_t SrcAddr, uint64_t SrcSize,
385                                    uint64_t DstAddr, uint64_t DstSize,
386                                    uint64_t Count) const {
387   assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
388 
389   bool IsForwardJump = SrcAddr <= DstAddr;
390   uint64_t JumpDistance = 0;
391   // Computing the length of the jump so that it takes the sizes of the two
392   // blocks into consideration
393   if (IsForwardJump) {
394     JumpDistance = (DstAddr + DstSize) - (SrcAddr);
395   } else {
396     JumpDistance = (SrcAddr + SrcSize) - (DstAddr);
397   }
398 
399   if (JumpDistance >= opts::TailDuplicationMaxCacheDistance)
400     return 0;
401   double Prob = 1.0 - static_cast<double>(JumpDistance) /
402                           opts::TailDuplicationMaxCacheDistance;
403   return (IsForwardJump ? 1.0 : opts::TailDuplicationCacheBackwardWeight) *
404          Prob * Count;
405 }
406 
cacheScoreImproved(const MCCodeEmitter * Emitter,BinaryFunction & BF,BinaryBasicBlock * Pred,BinaryBasicBlock * Tail) const407 bool TailDuplication::cacheScoreImproved(const MCCodeEmitter *Emitter,
408                                          BinaryFunction &BF,
409                                          BinaryBasicBlock *Pred,
410                                          BinaryBasicBlock *Tail) const {
411   // Collect (estimated) basic block sizes
412   DenseMap<const BinaryBasicBlock *, uint64_t> BBSize;
413   for (const BinaryBasicBlock &BB : BF) {
414     BBSize[&BB] = std::max<uint64_t>(BB.estimateSize(Emitter), 1);
415   }
416 
417   // Build current addresses of basic blocks starting at the entry block
418   DenseMap<BinaryBasicBlock *, uint64_t> CurAddr;
419   uint64_t Addr = 0;
420   for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
421     CurAddr[SrcBB] = Addr;
422     Addr += BBSize[SrcBB];
423   }
424 
425   // Build new addresses (after duplication) starting at the entry block
426   DenseMap<BinaryBasicBlock *, uint64_t> NewAddr;
427   Addr = 0;
428   for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
429     NewAddr[SrcBB] = Addr;
430     Addr += BBSize[SrcBB];
431     if (SrcBB == Pred)
432       Addr += BBSize[Tail];
433   }
434 
435   // Compute the cache score for the existing layout of basic blocks
436   double CurScore = 0;
437   for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
438     auto BI = SrcBB->branch_info_begin();
439     for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
440       if (SrcBB != DstBB) {
441         CurScore += cacheScore(CurAddr[SrcBB], BBSize[SrcBB], CurAddr[DstBB],
442                                BBSize[DstBB], BI->Count);
443       }
444       ++BI;
445     }
446   }
447 
448   // Compute the cache score for the layout of blocks after tail duplication
449   double NewScore = 0;
450   for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
451     auto BI = SrcBB->branch_info_begin();
452     for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
453       if (SrcBB != DstBB) {
454         if (SrcBB == Pred && DstBB == Tail) {
455           NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB],
456                                  NewAddr[SrcBB] + BBSize[SrcBB], BBSize[DstBB],
457                                  BI->Count);
458         } else {
459           NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB], NewAddr[DstBB],
460                                  BBSize[DstBB], BI->Count);
461         }
462       }
463       ++BI;
464     }
465   }
466 
467   return NewScore > CurScore;
468 }
469 
470 std::vector<BinaryBasicBlock *>
cacheDuplicate(const MCCodeEmitter * Emitter,BinaryFunction & BF,BinaryBasicBlock * Pred,BinaryBasicBlock * Tail) const471 TailDuplication::cacheDuplicate(const MCCodeEmitter *Emitter,
472                                 BinaryFunction &BF, BinaryBasicBlock *Pred,
473                                 BinaryBasicBlock *Tail) const {
474   std::vector<BinaryBasicBlock *> BlocksToDuplicate;
475 
476   // No need to duplicate cold basic blocks
477   if (Pred->isCold() || Tail->isCold()) {
478     return BlocksToDuplicate;
479   }
480   // Always duplicate "small" tail basic blocks, which might be beneficial for
481   // code size, since a jump instruction is eliminated
482   if (Tail->estimateSize(Emitter) <= opts::TailDuplicationMinimumDuplication) {
483     BlocksToDuplicate.push_back(Tail);
484     return BlocksToDuplicate;
485   }
486   // Never duplicate "large" tail basic blocks
487   if (Tail->estimateSize(Emitter) > opts::TailDuplicationMaximumDuplication) {
488     return BlocksToDuplicate;
489   }
490   // Do not append basic blocks after the last hot block in the current layout
491   auto NextBlock = BF.getLayout().getBasicBlockAfter(Pred);
492   if (NextBlock == nullptr || (!Pred->isCold() && NextBlock->isCold())) {
493     return BlocksToDuplicate;
494   }
495 
496   // Duplicate the tail only if it improves the cache score
497   if (cacheScoreImproved(Emitter, BF, Pred, Tail)) {
498     BlocksToDuplicate.push_back(Tail);
499   }
500 
501   return BlocksToDuplicate;
502 }
503 
duplicateBlocks(BinaryBasicBlock & BB,const std::vector<BinaryBasicBlock * > & BlocksToDuplicate) const504 std::vector<BinaryBasicBlock *> TailDuplication::duplicateBlocks(
505     BinaryBasicBlock &BB,
506     const std::vector<BinaryBasicBlock *> &BlocksToDuplicate) const {
507   BinaryFunction *BF = BB.getFunction();
508   BinaryContext &BC = BF->getBinaryContext();
509 
510   // Ratio of this new branches execution count to the total size of the
511   // successor's execution count.  Used to set this new branches execution count
512   // and lower the old successor's execution count
513   double ExecutionCountRatio =
514       BB.getExecutionCount() >= BB.getSuccessor()->getExecutionCount()
515           ? 1.0
516           : (double)BB.getExecutionCount() /
517                 BB.getSuccessor()->getExecutionCount();
518 
519   // Use the last branch info when adding a successor to LastBB
520   BinaryBasicBlock::BinaryBranchInfo &LastBI =
521       BB.getBranchInfo(*(BB.getSuccessor()));
522 
523   BinaryBasicBlock *LastOriginalBB = &BB;
524   BinaryBasicBlock *LastDuplicatedBB = &BB;
525   assert(LastDuplicatedBB->succ_size() == 1 &&
526          "tail duplication cannot act on a block with more than 1 successor");
527   LastDuplicatedBB->removeSuccessor(LastDuplicatedBB->getSuccessor());
528 
529   std::vector<std::unique_ptr<BinaryBasicBlock>> DuplicatedBlocks;
530   std::vector<BinaryBasicBlock *> DuplicatedBlocksToReturn;
531 
532   for (BinaryBasicBlock *CurBB : BlocksToDuplicate) {
533     DuplicatedBlocks.emplace_back(
534         BF->createBasicBlock((BC.Ctx)->createNamedTempSymbol("tail-dup")));
535     BinaryBasicBlock *NewBB = DuplicatedBlocks.back().get();
536 
537     NewBB->addInstructions(CurBB->begin(), CurBB->end());
538     // Set execution count as if it was just a copy of the original
539     NewBB->setExecutionCount(CurBB->getExecutionCount());
540     NewBB->setIsCold(CurBB->isCold());
541     LastDuplicatedBB->addSuccessor(NewBB, LastBI);
542 
543     DuplicatedBlocksToReturn.push_back(NewBB);
544 
545     // As long as its not the first block, adjust both original and duplicated
546     // to what they should be
547     if (LastDuplicatedBB != &BB) {
548       LastOriginalBB->adjustExecutionCount(1.0 - ExecutionCountRatio);
549       LastDuplicatedBB->adjustExecutionCount(ExecutionCountRatio);
550     }
551 
552     if (CurBB->succ_size() == 1)
553       LastBI = CurBB->getBranchInfo(*(CurBB->getSuccessor()));
554 
555     LastOriginalBB = CurBB;
556     LastDuplicatedBB = NewBB;
557   }
558 
559   LastDuplicatedBB->addSuccessors(
560       LastOriginalBB->succ_begin(), LastOriginalBB->succ_end(),
561       LastOriginalBB->branch_info_begin(), LastOriginalBB->branch_info_end());
562 
563   LastOriginalBB->adjustExecutionCount(1.0 - ExecutionCountRatio);
564   LastDuplicatedBB->adjustExecutionCount(ExecutionCountRatio);
565 
566   BF->insertBasicBlocks(&BB, std::move(DuplicatedBlocks));
567 
568   return DuplicatedBlocksToReturn;
569 }
570 
runOnFunction(BinaryFunction & Function)571 void TailDuplication::runOnFunction(BinaryFunction &Function) {
572   // Create a separate MCCodeEmitter to allow lock-free execution
573   BinaryContext::IndependentCodeEmitter Emitter;
574   if (!opts::NoThreads) {
575     Emitter = Function.getBinaryContext().createIndependentMCCodeEmitter();
576   }
577 
578   Function.getLayout().updateLayoutIndices();
579 
580   // New blocks will be added and layout will change,
581   // so make a copy here to iterate over the original layout
582   BinaryFunction::BasicBlockOrderType BlockLayout(
583       Function.getLayout().block_begin(), Function.getLayout().block_end());
584   bool ModifiedFunction = false;
585   for (BinaryBasicBlock *BB : BlockLayout) {
586     AllDynamicCount += BB->getKnownExecutionCount();
587 
588     // The block must be with one successor
589     if (BB->succ_size() != 1)
590       continue;
591     BinaryBasicBlock *Tail = BB->getSuccessor();
592     // Verify that the tail should be duplicated
593     if (!shouldDuplicate(BB, Tail))
594       continue;
595 
596     std::vector<BinaryBasicBlock *> BlocksToDuplicate;
597     if (opts::TailDuplicationMode == TailDuplication::TD_AGGRESSIVE) {
598       BlocksToDuplicate = aggressiveDuplicate(*BB, *Tail);
599     } else if (opts::TailDuplicationMode == TailDuplication::TD_MODERATE) {
600       BlocksToDuplicate = moderateDuplicate(*BB, *Tail);
601     } else if (opts::TailDuplicationMode == TailDuplication::TD_CACHE) {
602       BlocksToDuplicate = cacheDuplicate(Emitter.MCE.get(), Function, BB, Tail);
603     } else {
604       llvm_unreachable("unknown tail duplication mode");
605     }
606 
607     if (BlocksToDuplicate.empty())
608       continue;
609 
610     // Apply the duplication
611     ModifiedFunction = true;
612     DuplicationsDynamicCount += BB->getExecutionCount();
613     auto DuplicatedBlocks = duplicateBlocks(*BB, BlocksToDuplicate);
614     for (BinaryBasicBlock *BB : DuplicatedBlocks) {
615       DuplicatedBlockCount++;
616       DuplicatedByteCount += BB->estimateSize(Emitter.MCE.get());
617     }
618 
619     if (opts::TailDuplicationConstCopyPropagation) {
620       constantAndCopyPropagate(*BB, DuplicatedBlocks);
621       BinaryBasicBlock *FirstBB = BlocksToDuplicate[0];
622       if (FirstBB->pred_size() == 1) {
623         BinaryBasicBlock *PredBB = *FirstBB->pred_begin();
624         if (PredBB->succ_size() == 1)
625           constantAndCopyPropagate(*PredBB, BlocksToDuplicate);
626       }
627     }
628 
629     // Layout indices might be stale after duplication
630     Function.getLayout().updateLayoutIndices();
631   }
632   if (ModifiedFunction)
633     ModifiedFunctions++;
634 }
635 
runOnFunctions(BinaryContext & BC)636 Error TailDuplication::runOnFunctions(BinaryContext &BC) {
637   if (opts::TailDuplicationMode == TailDuplication::TD_NONE)
638     return Error::success();
639 
640   for (auto &It : BC.getBinaryFunctions()) {
641     BinaryFunction &Function = It.second;
642     if (!shouldOptimize(Function))
643       continue;
644     runOnFunction(Function);
645   }
646 
647   BC.outs()
648       << "BOLT-INFO: tail duplication"
649       << format(" modified %zu (%.2f%%) functions;", ModifiedFunctions,
650                 100.0 * ModifiedFunctions / BC.getBinaryFunctions().size())
651       << format(" duplicated %zu blocks (%zu bytes) responsible for",
652                 DuplicatedBlockCount, DuplicatedByteCount)
653       << format(" %zu dynamic executions (%.2f%% of all block executions)",
654                 DuplicationsDynamicCount,
655                 100.0 * DuplicationsDynamicCount / AllDynamicCount)
656       << "\n";
657 
658   if (opts::TailDuplicationConstCopyPropagation) {
659     BC.outs() << "BOLT-INFO: tail duplication "
660               << format(
661                      "applied %zu static and %zu dynamic propagation deletions",
662                      StaticInstructionDeletionCount,
663                      DynamicInstructionDeletionCount)
664               << "\n";
665   }
666   return Error::success();
667 }
668 
669 } // end namespace bolt
670 } // end namespace llvm
671