1 //===- bolt/Passes/MCF.cpp ------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements functions for solving minimum-cost flow problem.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "bolt/Passes/MCF.h"
14 #include "bolt/Core/BinaryFunction.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/DataflowInfoManager.h"
17 #include "bolt/Utils/CommandLineOpts.h"
18 #include "llvm/ADT/DenseMap.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/Support/CommandLine.h"
21 #include <algorithm>
22 #include <vector>
23
24 #undef DEBUG_TYPE
25 #define DEBUG_TYPE "mcf"
26
27 using namespace llvm;
28 using namespace bolt;
29
30 namespace opts {
31
32 extern cl::OptionCategory BoltOptCategory;
33
34 static cl::opt<bool> IterativeGuess(
35 "iterative-guess",
36 cl::desc("in non-LBR mode, guess edge counts using iterative technique"),
37 cl::Hidden, cl::cat(BoltOptCategory));
38 } // namespace opts
39
40 namespace llvm {
41 namespace bolt {
42
43 namespace {
44
45 // Edge Weight Inference Heuristic
46 //
47 // We start by maintaining the invariant used in LBR mode where the sum of
48 // pred edges count is equal to the block execution count. This loop will set
49 // pred edges count by balancing its own execution count in different pred
50 // edges. The weight of each edge is guessed by looking at how hot each pred
51 // block is (in terms of samples).
52 // There are two caveats in this approach. One is for critical edges and the
53 // other is for self-referencing blocks (loops of 1 BB). For critical edges,
54 // we can't infer the hotness of them based solely on pred BBs execution
55 // count. For each critical edge we look at the pred BB, then look at its
56 // succs to adjust its weight.
57 //
58 // [ 60 ] [ 25 ]
59 // | \ |
60 // [ 10 ] [ 75 ]
61 //
62 // The illustration above shows a critical edge \. We wish to adjust bb count
63 // 60 to 50 to properly determine the weight of the critical edge to be
64 // 50 / 75.
65 // For self-referencing edges, we attribute its weight by subtracting the
66 // current BB execution count by the sum of predecessors count if this result
67 // is non-negative.
68 using EdgeWeightMap =
69 DenseMap<std::pair<const BinaryBasicBlock *, const BinaryBasicBlock *>,
70 double>;
71
72 template <class NodeT>
73 void updateEdgeWeight(EdgeWeightMap &EdgeWeights, const BinaryBasicBlock *A,
74 const BinaryBasicBlock *B, double Weight);
75
76 template <>
updateEdgeWeight(EdgeWeightMap & EdgeWeights,const BinaryBasicBlock * A,const BinaryBasicBlock * B,double Weight)77 void updateEdgeWeight<BinaryBasicBlock *>(EdgeWeightMap &EdgeWeights,
78 const BinaryBasicBlock *A,
79 const BinaryBasicBlock *B,
80 double Weight) {
81 EdgeWeights[std::make_pair(A, B)] = Weight;
82 }
83
84 template <>
updateEdgeWeight(EdgeWeightMap & EdgeWeights,const BinaryBasicBlock * A,const BinaryBasicBlock * B,double Weight)85 void updateEdgeWeight<Inverse<BinaryBasicBlock *>>(EdgeWeightMap &EdgeWeights,
86 const BinaryBasicBlock *A,
87 const BinaryBasicBlock *B,
88 double Weight) {
89 EdgeWeights[std::make_pair(B, A)] = Weight;
90 }
91
92 template <class NodeT>
computeEdgeWeights(BinaryBasicBlock * BB,EdgeWeightMap & EdgeWeights)93 void computeEdgeWeights(BinaryBasicBlock *BB, EdgeWeightMap &EdgeWeights) {
94 typedef GraphTraits<NodeT> GraphT;
95 typedef GraphTraits<Inverse<NodeT>> InvTraits;
96
97 double TotalChildrenCount = 0.0;
98 SmallVector<double, 4> ChildrenExecCount;
99 // First pass computes total children execution count that directly
100 // contribute to this BB.
101 for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB),
102 E = GraphT::child_end(BB);
103 CI != E; ++CI) {
104 typename GraphT::NodeRef Child = *CI;
105 double ChildExecCount = Child->getExecutionCount();
106 // Is self-reference?
107 if (Child == BB) {
108 ChildExecCount = 0.0; // will fill this in second pass
109 } else if (GraphT::child_end(BB) - GraphT::child_begin(BB) > 1 &&
110 InvTraits::child_end(Child) - InvTraits::child_begin(Child) >
111 1) {
112 // Handle critical edges. This will cause a skew towards crit edges, but
113 // it is a quick solution.
114 double CritWeight = 0.0;
115 uint64_t Denominator = 0;
116 for (typename InvTraits::ChildIteratorType
117 II = InvTraits::child_begin(Child),
118 IE = InvTraits::child_end(Child);
119 II != IE; ++II) {
120 typename GraphT::NodeRef N = *II;
121 Denominator += N->getExecutionCount();
122 if (N != BB)
123 continue;
124 CritWeight = N->getExecutionCount();
125 }
126 if (Denominator)
127 CritWeight /= static_cast<double>(Denominator);
128 ChildExecCount *= CritWeight;
129 }
130 ChildrenExecCount.push_back(ChildExecCount);
131 TotalChildrenCount += ChildExecCount;
132 }
133 // Second pass fixes the weight of a possible self-reference edge
134 uint32_t ChildIndex = 0;
135 for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB),
136 E = GraphT::child_end(BB);
137 CI != E; ++CI) {
138 typename GraphT::NodeRef Child = *CI;
139 if (Child != BB) {
140 ++ChildIndex;
141 continue;
142 }
143 if (static_cast<double>(BB->getExecutionCount()) > TotalChildrenCount) {
144 ChildrenExecCount[ChildIndex] =
145 BB->getExecutionCount() - TotalChildrenCount;
146 TotalChildrenCount += ChildrenExecCount[ChildIndex];
147 }
148 break;
149 }
150 // Third pass finally assigns weights to edges
151 ChildIndex = 0;
152 for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB),
153 E = GraphT::child_end(BB);
154 CI != E; ++CI) {
155 typename GraphT::NodeRef Child = *CI;
156 double Weight = 1 / (GraphT::child_end(BB) - GraphT::child_begin(BB));
157 if (TotalChildrenCount != 0.0)
158 Weight = ChildrenExecCount[ChildIndex] / TotalChildrenCount;
159 updateEdgeWeight<NodeT>(EdgeWeights, BB, Child, Weight);
160 ++ChildIndex;
161 }
162 }
163
164 template <class NodeT>
computeEdgeWeights(BinaryFunction & BF,EdgeWeightMap & EdgeWeights)165 void computeEdgeWeights(BinaryFunction &BF, EdgeWeightMap &EdgeWeights) {
166 for (BinaryBasicBlock &BB : BF)
167 computeEdgeWeights<NodeT>(&BB, EdgeWeights);
168 }
169
170 /// Make BB count match the sum of all incoming edges. If AllEdges is true,
171 /// make it match max(SumPredEdges, SumSuccEdges).
recalculateBBCounts(BinaryFunction & BF,bool AllEdges)172 void recalculateBBCounts(BinaryFunction &BF, bool AllEdges) {
173 for (BinaryBasicBlock &BB : BF) {
174 uint64_t TotalPredsEWeight = 0;
175 for (BinaryBasicBlock *Pred : BB.predecessors())
176 TotalPredsEWeight += Pred->getBranchInfo(BB).Count;
177
178 if (TotalPredsEWeight > BB.getExecutionCount())
179 BB.setExecutionCount(TotalPredsEWeight);
180
181 if (!AllEdges)
182 continue;
183
184 uint64_t TotalSuccsEWeight = 0;
185 for (BinaryBasicBlock::BinaryBranchInfo &BI : BB.branch_info())
186 TotalSuccsEWeight += BI.Count;
187
188 if (TotalSuccsEWeight > BB.getExecutionCount())
189 BB.setExecutionCount(TotalSuccsEWeight);
190 }
191 }
192
193 // This is our main edge count guessing heuristic. Look at predecessors and
194 // assign a proportionally higher count to pred edges coming from blocks with
195 // a higher execution count in comparison with the other predecessor blocks,
196 // making SumPredEdges match the current BB count.
197 // If "UseSucc" is true, apply the same logic to successor edges as well. Since
198 // some successor edges may already have assigned a count, only update it if the
199 // new count is higher.
guessEdgeByRelHotness(BinaryFunction & BF,bool UseSucc,EdgeWeightMap & PredEdgeWeights,EdgeWeightMap & SuccEdgeWeights)200 void guessEdgeByRelHotness(BinaryFunction &BF, bool UseSucc,
201 EdgeWeightMap &PredEdgeWeights,
202 EdgeWeightMap &SuccEdgeWeights) {
203 for (BinaryBasicBlock &BB : BF) {
204 for (BinaryBasicBlock *Pred : BB.predecessors()) {
205 double RelativeExec = PredEdgeWeights[std::make_pair(Pred, &BB)];
206 RelativeExec *= BB.getExecutionCount();
207 BinaryBasicBlock::BinaryBranchInfo &BI = Pred->getBranchInfo(BB);
208 if (static_cast<uint64_t>(RelativeExec) > BI.Count)
209 BI.Count = static_cast<uint64_t>(RelativeExec);
210 }
211
212 if (!UseSucc)
213 continue;
214
215 auto BI = BB.branch_info_begin();
216 for (BinaryBasicBlock *Succ : BB.successors()) {
217 double RelativeExec = SuccEdgeWeights[std::make_pair(&BB, Succ)];
218 RelativeExec *= BB.getExecutionCount();
219 if (static_cast<uint64_t>(RelativeExec) > BI->Count)
220 BI->Count = static_cast<uint64_t>(RelativeExec);
221 ++BI;
222 }
223 }
224 }
225
226 using ArcSet =
227 DenseSet<std::pair<const BinaryBasicBlock *, const BinaryBasicBlock *>>;
228
229 /// Predecessor edges version of guessEdgeByIterativeApproach. GuessedArcs has
230 /// all edges we already established their count. Try to guess the count of
231 /// the remaining edge, if there is only one to guess, and return true if we
232 /// were able to guess.
guessPredEdgeCounts(BinaryBasicBlock * BB,ArcSet & GuessedArcs)233 bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) {
234 if (BB->pred_size() == 0)
235 return false;
236
237 uint64_t TotalPredCount = 0;
238 unsigned NumGuessedEdges = 0;
239 for (BinaryBasicBlock *Pred : BB->predecessors()) {
240 if (GuessedArcs.count(std::make_pair(Pred, BB)))
241 ++NumGuessedEdges;
242 TotalPredCount += Pred->getBranchInfo(*BB).Count;
243 }
244
245 if (NumGuessedEdges != BB->pred_size() - 1)
246 return false;
247
248 int64_t Guessed =
249 static_cast<int64_t>(BB->getExecutionCount()) - TotalPredCount;
250 if (Guessed < 0)
251 Guessed = 0;
252
253 for (BinaryBasicBlock *Pred : BB->predecessors()) {
254 if (GuessedArcs.count(std::make_pair(Pred, BB)))
255 continue;
256
257 Pred->getBranchInfo(*BB).Count = Guessed;
258 GuessedArcs.insert(std::make_pair(Pred, BB));
259 return true;
260 }
261 llvm_unreachable("Expected unguessed arc");
262 }
263
264 /// Successor edges version of guessEdgeByIterativeApproach. GuessedArcs has
265 /// all edges we already established their count. Try to guess the count of
266 /// the remaining edge, if there is only one to guess, and return true if we
267 /// were able to guess.
guessSuccEdgeCounts(BinaryBasicBlock * BB,ArcSet & GuessedArcs)268 bool guessSuccEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) {
269 if (BB->succ_size() == 0)
270 return false;
271
272 uint64_t TotalSuccCount = 0;
273 unsigned NumGuessedEdges = 0;
274 auto BI = BB->branch_info_begin();
275 for (BinaryBasicBlock *Succ : BB->successors()) {
276 if (GuessedArcs.count(std::make_pair(BB, Succ)))
277 ++NumGuessedEdges;
278 TotalSuccCount += BI->Count;
279 ++BI;
280 }
281
282 if (NumGuessedEdges != BB->succ_size() - 1)
283 return false;
284
285 int64_t Guessed =
286 static_cast<int64_t>(BB->getExecutionCount()) - TotalSuccCount;
287 if (Guessed < 0)
288 Guessed = 0;
289
290 BI = BB->branch_info_begin();
291 for (BinaryBasicBlock *Succ : BB->successors()) {
292 if (GuessedArcs.count(std::make_pair(BB, Succ))) {
293 ++BI;
294 continue;
295 }
296
297 BI->Count = Guessed;
298 GuessedArcs.insert(std::make_pair(BB, Succ));
299 return true;
300 }
301 llvm_unreachable("Expected unguessed arc");
302 }
303
304 /// Guess edge count whenever we have only one edge (pred or succ) left
305 /// to guess. Then make its count equal to BB count minus all other edge
306 /// counts we already know their count. Repeat this until there is no
307 /// change.
guessEdgeByIterativeApproach(BinaryFunction & BF)308 void guessEdgeByIterativeApproach(BinaryFunction &BF) {
309 ArcSet KnownArcs;
310 bool Changed = false;
311
312 do {
313 Changed = false;
314 for (BinaryBasicBlock &BB : BF) {
315 if (guessPredEdgeCounts(&BB, KnownArcs))
316 Changed = true;
317 if (guessSuccEdgeCounts(&BB, KnownArcs))
318 Changed = true;
319 }
320 } while (Changed);
321
322 // Guess count for non-inferred edges
323 for (BinaryBasicBlock &BB : BF) {
324 for (BinaryBasicBlock *Pred : BB.predecessors()) {
325 if (KnownArcs.count(std::make_pair(Pred, &BB)))
326 continue;
327 BinaryBasicBlock::BinaryBranchInfo &BI = Pred->getBranchInfo(BB);
328 BI.Count =
329 std::min(Pred->getExecutionCount(), BB.getExecutionCount()) / 2;
330 KnownArcs.insert(std::make_pair(Pred, &BB));
331 }
332 auto BI = BB.branch_info_begin();
333 for (BinaryBasicBlock *Succ : BB.successors()) {
334 if (KnownArcs.count(std::make_pair(&BB, Succ))) {
335 ++BI;
336 continue;
337 }
338 BI->Count =
339 std::min(BB.getExecutionCount(), Succ->getExecutionCount()) / 2;
340 KnownArcs.insert(std::make_pair(&BB, Succ));
341 break;
342 }
343 }
344 }
345
346 /// Associate each basic block with the BinaryLoop object corresponding to the
347 /// innermost loop containing this block.
348 DenseMap<const BinaryBasicBlock *, const BinaryLoop *>
createLoopNestLevelMap(BinaryFunction & BF)349 createLoopNestLevelMap(BinaryFunction &BF) {
350 DenseMap<const BinaryBasicBlock *, const BinaryLoop *> LoopNestLevel;
351 const BinaryLoopInfo &BLI = BF.getLoopInfo();
352
353 for (BinaryBasicBlock &BB : BF)
354 LoopNestLevel[&BB] = BLI[&BB];
355
356 return LoopNestLevel;
357 }
358
359 } // end anonymous namespace
360
equalizeBBCounts(DataflowInfoManager & Info,BinaryFunction & BF)361 void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF) {
362 if (BF.begin() == BF.end())
363 return;
364
365 DominatorAnalysis<false> &DA = Info.getDominatorAnalysis();
366 DominatorAnalysis<true> &PDA = Info.getPostDominatorAnalysis();
367 auto &InsnToBB = Info.getInsnToBBMap();
368 // These analyses work at the instruction granularity, but we really only need
369 // basic block granularity here. So we'll use a set of visited edges to avoid
370 // revisiting the same BBs again and again.
371 DenseMap<const BinaryBasicBlock *, std::set<const BinaryBasicBlock *>>
372 Visited;
373 // Equivalence classes mapping. Each equivalence class is defined by the set
374 // of BBs that obeys the aforementioned properties.
375 DenseMap<const BinaryBasicBlock *, signed> BBsToEC;
376 std::vector<std::vector<BinaryBasicBlock *>> Classes;
377
378 BF.calculateLoopInfo();
379 DenseMap<const BinaryBasicBlock *, const BinaryLoop *> LoopNestLevel =
380 createLoopNestLevelMap(BF);
381
382 for (BinaryBasicBlock &BB : BF)
383 BBsToEC[&BB] = -1;
384
385 for (BinaryBasicBlock &BB : BF) {
386 auto I = BB.begin();
387 if (I == BB.end())
388 continue;
389
390 DA.doForAllDominators(*I, [&](const MCInst &DomInst) {
391 BinaryBasicBlock *DomBB = InsnToBB[&DomInst];
392 if (Visited[DomBB].count(&BB))
393 return;
394 Visited[DomBB].insert(&BB);
395 if (!PDA.doesADominateB(*I, DomInst))
396 return;
397 if (LoopNestLevel[&BB] != LoopNestLevel[DomBB])
398 return;
399 if (BBsToEC[DomBB] == -1 && BBsToEC[&BB] == -1) {
400 BBsToEC[DomBB] = Classes.size();
401 BBsToEC[&BB] = Classes.size();
402 Classes.emplace_back();
403 Classes.back().push_back(DomBB);
404 Classes.back().push_back(&BB);
405 return;
406 }
407 if (BBsToEC[DomBB] == -1) {
408 BBsToEC[DomBB] = BBsToEC[&BB];
409 Classes[BBsToEC[&BB]].push_back(DomBB);
410 return;
411 }
412 if (BBsToEC[&BB] == -1) {
413 BBsToEC[&BB] = BBsToEC[DomBB];
414 Classes[BBsToEC[DomBB]].push_back(&BB);
415 return;
416 }
417 signed BBECNum = BBsToEC[&BB];
418 std::vector<BinaryBasicBlock *> DomEC = Classes[BBsToEC[DomBB]];
419 std::vector<BinaryBasicBlock *> BBEC = Classes[BBECNum];
420 for (BinaryBasicBlock *Block : DomEC) {
421 BBsToEC[Block] = BBECNum;
422 BBEC.push_back(Block);
423 }
424 DomEC.clear();
425 });
426 }
427
428 for (std::vector<BinaryBasicBlock *> &Class : Classes) {
429 uint64_t Max = 0ULL;
430 for (BinaryBasicBlock *BB : Class)
431 Max = std::max(Max, BB->getExecutionCount());
432 for (BinaryBasicBlock *BB : Class)
433 BB->setExecutionCount(Max);
434 }
435 }
436
runOnFunction(BinaryFunction & BF)437 void EstimateEdgeCounts::runOnFunction(BinaryFunction &BF) {
438 EdgeWeightMap PredEdgeWeights;
439 EdgeWeightMap SuccEdgeWeights;
440 if (!opts::IterativeGuess) {
441 computeEdgeWeights<Inverse<BinaryBasicBlock *>>(BF, PredEdgeWeights);
442 computeEdgeWeights<BinaryBasicBlock *>(BF, SuccEdgeWeights);
443 }
444 if (opts::EqualizeBBCounts) {
445 LLVM_DEBUG(BF.print(dbgs(), "before equalize BB counts"));
446 auto Info = DataflowInfoManager(BF, nullptr, nullptr);
447 equalizeBBCounts(Info, BF);
448 LLVM_DEBUG(BF.print(dbgs(), "after equalize BB counts"));
449 }
450 if (opts::IterativeGuess)
451 guessEdgeByIterativeApproach(BF);
452 else
453 guessEdgeByRelHotness(BF, /*UseSuccs=*/false, PredEdgeWeights,
454 SuccEdgeWeights);
455 recalculateBBCounts(BF, /*AllEdges=*/false);
456 }
457
runOnFunctions(BinaryContext & BC)458 Error EstimateEdgeCounts::runOnFunctions(BinaryContext &BC) {
459 if (llvm::none_of(llvm::make_second_range(BC.getBinaryFunctions()),
460 [](const BinaryFunction &BF) {
461 return BF.getProfileFlags() == BinaryFunction::PF_SAMPLE;
462 }))
463 return Error::success();
464
465 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
466 runOnFunction(BF);
467 };
468 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
469 return BF.getProfileFlags() != BinaryFunction::PF_SAMPLE;
470 };
471
472 ParallelUtilities::runOnEachFunction(
473 BC, ParallelUtilities::SchedulingPolicy::SP_BB_QUADRATIC, WorkFun,
474 SkipFunc, "EstimateEdgeCounts");
475 return Error::success();
476 }
477
478 } // namespace bolt
479 } // namespace llvm
480