1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements multiple passes for binary optimization and analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/BinaryPasses.h" 14 #include "bolt/Core/FunctionLayout.h" 15 #include "bolt/Core/ParallelUtilities.h" 16 #include "bolt/Passes/ReorderAlgorithm.h" 17 #include "bolt/Passes/ReorderFunctions.h" 18 #include "bolt/Utils/CommandLineOpts.h" 19 #include "llvm/Support/CommandLine.h" 20 #include <atomic> 21 #include <mutex> 22 #include <numeric> 23 #include <vector> 24 25 #define DEBUG_TYPE "bolt-opts" 26 27 using namespace llvm; 28 using namespace bolt; 29 30 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) { 31 assert(C > bolt::DynoStats::FIRST_DYNO_STAT && 32 C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category."); 33 34 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1]; 35 36 OptNames[C] = bolt::DynoStats::Description(C); 37 38 std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); 39 40 return OptNames[C].c_str(); 41 } 42 43 namespace opts { 44 45 extern cl::OptionCategory BoltCategory; 46 extern cl::OptionCategory BoltOptCategory; 47 48 extern cl::opt<unsigned> Verbosity; 49 extern cl::opt<bool> EnableBAT; 50 extern cl::opt<unsigned> ExecutionCountThreshold; 51 extern cl::opt<bool> UpdateDebugSections; 52 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions; 53 54 enum DynoStatsSortOrder : char { 55 Ascending, 56 Descending 57 }; 58 59 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt( 60 "print-sorted-by-order", 61 cl::desc("use ascending or descending order when printing functions " 62 "ordered by dyno stats"), 63 cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory)); 64 65 cl::list<std::string> 66 HotTextMoveSections("hot-text-move-sections", 67 cl::desc("list of sections containing functions used for hugifying hot text. " 68 "BOLT makes sure these functions are not placed on the same page as " 69 "the hot text. (default=\'.stub,.mover\')."), 70 cl::value_desc("sec1,sec2,sec3,..."), 71 cl::CommaSeparated, 72 cl::ZeroOrMore, 73 cl::cat(BoltCategory)); 74 75 bool isHotTextMover(const BinaryFunction &Function) { 76 for (std::string &SectionName : opts::HotTextMoveSections) { 77 if (Function.getOriginSectionName() && 78 *Function.getOriginSectionName() == SectionName) 79 return true; 80 } 81 82 return false; 83 } 84 85 static cl::opt<bool> MinBranchClusters( 86 "min-branch-clusters", 87 cl::desc("use a modified clustering algorithm geared towards minimizing " 88 "branches"), 89 cl::Hidden, cl::cat(BoltOptCategory)); 90 91 static cl::list<Peepholes::PeepholeOpts> Peepholes( 92 "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"), 93 cl::value_desc("opt1,opt2,opt3,..."), 94 cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"), 95 clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps", 96 "remove double jumps when able"), 97 clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps", 98 "insert tail call traps"), 99 clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches", 100 "remove useless conditional branches"), 101 clEnumValN(Peepholes::PEEP_ALL, "all", 102 "enable all peephole optimizations")), 103 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 104 105 static cl::opt<unsigned> 106 PrintFuncStat("print-function-statistics", 107 cl::desc("print statistics about basic block ordering"), 108 cl::init(0), cl::cat(BoltOptCategory)); 109 110 static cl::opt<bool> PrintLargeFunctions( 111 "print-large-functions", 112 cl::desc("print functions that could not be overwritten due to excessive " 113 "size"), 114 cl::init(false), cl::cat(BoltOptCategory)); 115 116 static cl::list<bolt::DynoStats::Category> 117 PrintSortedBy("print-sorted-by", cl::CommaSeparated, 118 cl::desc("print functions sorted by order of dyno stats"), 119 cl::value_desc("key1,key2,key3,..."), 120 cl::values( 121 #define D(name, description, ...) \ 122 clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name), \ 123 description), 124 REAL_DYNO_STATS 125 #undef D 126 clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all", 127 "sorted by all names")), 128 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 129 130 static cl::opt<bool> 131 PrintUnknown("print-unknown", 132 cl::desc("print names of functions with unknown control flow"), 133 cl::cat(BoltCategory), cl::Hidden); 134 135 static cl::opt<bool> 136 PrintUnknownCFG("print-unknown-cfg", 137 cl::desc("dump CFG of functions with unknown control flow"), 138 cl::cat(BoltCategory), cl::ReallyHidden); 139 140 // Please MSVC19 with a forward declaration: otherwise it reports an error about 141 // an undeclared variable inside a callback. 142 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks; 143 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks( 144 "reorder-blocks", cl::desc("change layout of basic blocks in a function"), 145 cl::init(bolt::ReorderBasicBlocks::LT_NONE), 146 cl::values( 147 clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none", 148 "do not reorder basic blocks"), 149 clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse", 150 "layout blocks in reverse order"), 151 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal", 152 "perform optimal layout based on profile"), 153 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH, 154 "branch-predictor", 155 "perform optimal layout prioritizing branch " 156 "predictions"), 157 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache", 158 "perform optimal layout prioritizing I-cache " 159 "behavior"), 160 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+", 161 "perform layout optimizing I-cache behavior"), 162 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp", 163 "perform layout optimizing I-cache behavior"), 164 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, 165 "cluster-shuffle", "perform random layout of clusters")), 166 cl::ZeroOrMore, cl::cat(BoltOptCategory), 167 cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) { 168 if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) { 169 errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please" 170 << " use '-reorder-blocks=ext-tsp' instead\n"; 171 ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP; 172 } 173 })); 174 175 static cl::opt<unsigned> ReportBadLayout( 176 "report-bad-layout", 177 cl::desc("print top <uint> functions with suboptimal code layout on input"), 178 cl::init(0), cl::Hidden, cl::cat(BoltOptCategory)); 179 180 static cl::opt<bool> 181 ReportStaleFuncs("report-stale", 182 cl::desc("print the list of functions with stale profile"), 183 cl::Hidden, cl::cat(BoltOptCategory)); 184 185 enum SctcModes : char { 186 SctcAlways, 187 SctcPreserveDirection, 188 SctcHeuristic 189 }; 190 191 static cl::opt<SctcModes> 192 SctcMode("sctc-mode", 193 cl::desc("mode for simplify conditional tail calls"), 194 cl::init(SctcAlways), 195 cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), 196 clEnumValN(SctcPreserveDirection, 197 "preserve", 198 "only perform sctc when branch direction is " 199 "preserved"), 200 clEnumValN(SctcHeuristic, 201 "heuristic", 202 "use branch prediction data to control sctc")), 203 cl::ZeroOrMore, 204 cl::cat(BoltOptCategory)); 205 206 static cl::opt<unsigned> 207 StaleThreshold("stale-threshold", 208 cl::desc( 209 "maximum percentage of stale functions to tolerate (default: 100)"), 210 cl::init(100), 211 cl::Hidden, 212 cl::cat(BoltOptCategory)); 213 214 static cl::opt<unsigned> TSPThreshold( 215 "tsp-threshold", 216 cl::desc( 217 "maximum number of hot basic blocks in a function for which to use " 218 "a precise TSP solution while re-ordering basic blocks"), 219 cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); 220 221 static cl::opt<unsigned> TopCalledLimit( 222 "top-called-limit", 223 cl::desc("maximum number of functions to print in top called " 224 "functions section"), 225 cl::init(100), cl::Hidden, cl::cat(BoltCategory)); 226 227 // Profile density options, synced with llvm-profgen/ProfileGenerator.cpp 228 static cl::opt<int> ProfileDensityCutOffHot( 229 "profile-density-cutoff-hot", cl::init(990000), 230 cl::desc("Total samples cutoff for functions used to calculate " 231 "profile density.")); 232 233 static cl::opt<double> ProfileDensityThreshold( 234 "profile-density-threshold", cl::init(60), 235 cl::desc("If the profile density is below the given threshold, it " 236 "will be suggested to increase the sampling rate."), 237 cl::Optional); 238 239 } // namespace opts 240 241 namespace llvm { 242 namespace bolt { 243 244 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { 245 return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG && 246 !BF.isIgnored(); 247 } 248 249 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { 250 return BF.isSimple() && !BF.isIgnored(); 251 } 252 253 void NormalizeCFG::runOnFunction(BinaryFunction &BF) { 254 uint64_t NumRemoved = 0; 255 uint64_t NumDuplicateEdges = 0; 256 uint64_t NeedsFixBranches = 0; 257 for (BinaryBasicBlock &BB : BF) { 258 if (!BB.empty()) 259 continue; 260 261 if (BB.isEntryPoint() || BB.isLandingPad()) 262 continue; 263 264 // Handle a dangling empty block. 265 if (BB.succ_size() == 0) { 266 // If an empty dangling basic block has a predecessor, it could be a 267 // result of codegen for __builtin_unreachable. In such case, do not 268 // remove the block. 269 if (BB.pred_size() == 0) { 270 BB.markValid(false); 271 ++NumRemoved; 272 } 273 continue; 274 } 275 276 // The block should have just one successor. 277 BinaryBasicBlock *Successor = BB.getSuccessor(); 278 assert(Successor && "invalid CFG encountered"); 279 280 // Redirect all predecessors to the successor block. 281 while (!BB.pred_empty()) { 282 BinaryBasicBlock *Predecessor = *BB.pred_begin(); 283 if (Predecessor->hasJumpTable()) 284 break; 285 286 if (Predecessor == Successor) 287 break; 288 289 BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB); 290 Predecessor->replaceSuccessor(&BB, Successor, BI.Count, 291 BI.MispredictedCount); 292 // We need to fix branches even if we failed to replace all successors 293 // and remove the block. 294 NeedsFixBranches = true; 295 } 296 297 if (BB.pred_empty()) { 298 BB.removeAllSuccessors(); 299 BB.markValid(false); 300 ++NumRemoved; 301 } 302 } 303 304 if (NumRemoved) 305 BF.eraseInvalidBBs(); 306 307 // Check for duplicate successors. Do it after the empty block elimination as 308 // we can get more duplicate successors. 309 for (BinaryBasicBlock &BB : BF) 310 if (!BB.hasJumpTable() && BB.succ_size() == 2 && 311 BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true)) 312 ++NumDuplicateEdges; 313 314 // fixBranches() will get rid of duplicate edges and update jump instructions. 315 if (NumDuplicateEdges || NeedsFixBranches) 316 BF.fixBranches(); 317 318 NumDuplicateEdgesMerged += NumDuplicateEdges; 319 NumBlocksRemoved += NumRemoved; 320 } 321 322 Error NormalizeCFG::runOnFunctions(BinaryContext &BC) { 323 ParallelUtilities::runOnEachFunction( 324 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, 325 [&](BinaryFunction &BF) { runOnFunction(BF); }, 326 [&](const BinaryFunction &BF) { return !shouldOptimize(BF); }, 327 "NormalizeCFG"); 328 if (NumBlocksRemoved) 329 BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block" 330 << (NumBlocksRemoved == 1 ? "" : "s") << '\n'; 331 if (NumDuplicateEdgesMerged) 332 BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged 333 << " duplicate CFG edge" 334 << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n'; 335 return Error::success(); 336 } 337 338 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { 339 BinaryContext &BC = Function.getBinaryContext(); 340 unsigned Count; 341 uint64_t Bytes; 342 Function.markUnreachableBlocks(); 343 LLVM_DEBUG({ 344 for (BinaryBasicBlock &BB : Function) { 345 if (!BB.isValid()) { 346 dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() 347 << " in function " << Function << "\n"; 348 Function.dump(); 349 } 350 } 351 }); 352 BinaryContext::IndependentCodeEmitter Emitter = 353 BC.createIndependentMCCodeEmitter(); 354 std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get()); 355 DeletedBlocks += Count; 356 DeletedBytes += Bytes; 357 if (Count) { 358 auto L = BC.scopeLock(); 359 Modified.insert(&Function); 360 if (opts::Verbosity > 0) 361 BC.outs() << "BOLT-INFO: removed " << Count 362 << " dead basic block(s) accounting for " << Bytes 363 << " bytes in function " << Function << '\n'; 364 } 365 } 366 367 Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { 368 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 369 runOnFunction(BF); 370 }; 371 372 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 373 return !shouldOptimize(BF) || BF.getLayout().block_empty(); 374 }; 375 376 ParallelUtilities::runOnEachFunction( 377 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, 378 SkipPredicate, "elimininate-unreachable"); 379 380 if (DeletedBlocks) 381 BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " 382 << DeletedBytes << " bytes of code\n"; 383 return Error::success(); 384 } 385 386 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { 387 return (BinaryFunctionPass::shouldPrint(BF) && 388 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); 389 } 390 391 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { 392 // Apply execution count threshold 393 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) 394 return false; 395 396 return BinaryFunctionPass::shouldOptimize(BF); 397 } 398 399 Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { 400 if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) 401 return Error::success(); 402 403 std::atomic_uint64_t ModifiedFuncCount(0); 404 std::mutex FunctionEditDistanceMutex; 405 DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance; 406 407 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 408 SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder; 409 if (opts::PrintFuncStat > 0) 410 llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder)); 411 412 const bool LayoutChanged = 413 modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters); 414 if (LayoutChanged) { 415 ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed); 416 if (opts::PrintFuncStat > 0) { 417 const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder); 418 std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex); 419 FunctionEditDistance[&BF] = Distance; 420 } 421 } 422 }; 423 424 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 425 return !shouldOptimize(BF); 426 }; 427 428 ParallelUtilities::runOnEachFunction( 429 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc, 430 "ReorderBasicBlocks"); 431 const size_t NumAllProfiledFunctions = 432 BC.NumProfiledFuncs + BC.NumStaleProfileFuncs; 433 434 BC.outs() << "BOLT-INFO: basic block reordering modified layout of " 435 << format( 436 "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n", 437 ModifiedFuncCount.load(std::memory_order_relaxed), 438 100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) / 439 NumAllProfiledFunctions, 440 100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) / 441 BC.getBinaryFunctions().size()); 442 443 if (opts::PrintFuncStat > 0) { 444 raw_ostream &OS = BC.outs(); 445 // Copy all the values into vector in order to sort them 446 std::map<uint64_t, BinaryFunction &> ScoreMap; 447 auto &BFs = BC.getBinaryFunctions(); 448 for (auto It = BFs.begin(); It != BFs.end(); ++It) 449 ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>( 450 It->second.getFunctionScore(), It->second)); 451 452 OS << "\nBOLT-INFO: Printing Function Statistics:\n\n"; 453 OS << " There are " << BFs.size() << " functions in total. \n"; 454 OS << " Number of functions being modified: " 455 << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n"; 456 OS << " User asks for detailed information on top " 457 << opts::PrintFuncStat << " functions. (Ranked by function score)" 458 << "\n\n"; 459 uint64_t I = 0; 460 for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit = 461 ScoreMap.rbegin(); 462 Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { 463 BinaryFunction &Function = Rit->second; 464 465 OS << " Information for function of top: " << (I + 1) << ": \n"; 466 OS << " Function Score is: " << Function.getFunctionScore() 467 << "\n"; 468 OS << " There are " << Function.size() 469 << " number of blocks in this function.\n"; 470 OS << " There are " << Function.getInstructionCount() 471 << " number of instructions in this function.\n"; 472 OS << " The edit distance for this function is: " 473 << FunctionEditDistance.lookup(&Function) << "\n\n"; 474 } 475 } 476 return Error::success(); 477 } 478 479 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, 480 LayoutType Type, 481 bool MinBranchClusters) const { 482 if (BF.size() == 0 || Type == LT_NONE) 483 return false; 484 485 BinaryFunction::BasicBlockOrderType NewLayout; 486 std::unique_ptr<ReorderAlgorithm> Algo; 487 488 // Cannot do optimal layout without profile. 489 if (Type != LT_REVERSE && !BF.hasValidProfile()) 490 return false; 491 492 if (Type == LT_REVERSE) { 493 Algo.reset(new ReverseReorderAlgorithm()); 494 } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) { 495 // Work on optimal solution if problem is small enough 496 LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); 497 Algo.reset(new TSPReorderAlgorithm()); 498 } else { 499 LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n"); 500 501 std::unique_ptr<ClusterAlgorithm> CAlgo; 502 if (MinBranchClusters) 503 CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); 504 else 505 CAlgo.reset(new PHGreedyClusterAlgorithm()); 506 507 switch (Type) { 508 case LT_OPTIMIZE: 509 Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); 510 break; 511 512 case LT_OPTIMIZE_BRANCH: 513 Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); 514 break; 515 516 case LT_OPTIMIZE_CACHE: 517 Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); 518 break; 519 520 case LT_OPTIMIZE_EXT_TSP: 521 Algo.reset(new ExtTSPReorderAlgorithm()); 522 break; 523 524 case LT_OPTIMIZE_SHUFFLE: 525 Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); 526 break; 527 528 default: 529 llvm_unreachable("unexpected layout type"); 530 } 531 } 532 533 Algo->reorderBasicBlocks(BF, NewLayout); 534 535 return BF.getLayout().update(NewLayout); 536 } 537 538 Error FixupBranches::runOnFunctions(BinaryContext &BC) { 539 for (auto &It : BC.getBinaryFunctions()) { 540 BinaryFunction &Function = It.second; 541 if (!BC.shouldEmit(Function) || !Function.isSimple()) 542 continue; 543 544 Function.fixBranches(); 545 } 546 return Error::success(); 547 } 548 549 Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) { 550 std::atomic<bool> HasFatal{false}; 551 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 552 if (!BF.finalizeCFIState()) { 553 if (BC.HasRelocations) { 554 BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF 555 << ". Exiting.\n"; 556 HasFatal = true; 557 return; 558 } 559 BF.setSimple(false); 560 return; 561 } 562 563 BF.setFinalized(); 564 565 // Update exception handling information. 566 BF.updateEHRanges(); 567 }; 568 569 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 570 return !BC.shouldEmit(BF); 571 }; 572 573 ParallelUtilities::runOnEachFunction( 574 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, 575 SkipPredicate, "FinalizeFunctions"); 576 if (HasFatal) 577 return createFatalBOLTError("finalize CFI state failure"); 578 return Error::success(); 579 } 580 581 Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { 582 if (BC.HasRelocations) 583 return Error::success(); 584 585 // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit 586 // incorrect meta data. 587 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 588 uint64_t HotSize, ColdSize; 589 std::tie(HotSize, ColdSize) = 590 BC.calculateEmittedSize(BF, /*FixBranches=*/false); 591 uint64_t MainFragmentSize = HotSize; 592 if (BF.hasIslandsInfo()) { 593 MainFragmentSize += 594 offsetToAlignment(BF.getAddress() + MainFragmentSize, 595 Align(BF.getConstantIslandAlignment())); 596 MainFragmentSize += BF.estimateConstantIslandSize(); 597 } 598 if (MainFragmentSize > BF.getMaxSize()) { 599 if (opts::PrintLargeFunctions) 600 BC.outs() << "BOLT-INFO: " << BF << " size of " << MainFragmentSize 601 << " bytes exceeds allocated space by " 602 << (MainFragmentSize - BF.getMaxSize()) << " bytes\n"; 603 BF.setSimple(false); 604 } 605 }; 606 607 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 608 return !shouldOptimize(BF); 609 }; 610 611 ParallelUtilities::runOnEachFunction( 612 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 613 SkipFunc, "CheckLargeFunctions"); 614 615 return Error::success(); 616 } 617 618 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { 619 // Unlike other passes, allow functions in non-CFG state. 620 return BF.isSimple() && !BF.isIgnored(); 621 } 622 623 Error LowerAnnotations::runOnFunctions(BinaryContext &BC) { 624 // Convert GnuArgsSize annotations into CFIs. 625 for (BinaryFunction *BF : BC.getAllBinaryFunctions()) { 626 for (FunctionFragment &FF : BF->getLayout().fragments()) { 627 // Reset at the start of the new fragment. 628 int64_t CurrentGnuArgsSize = 0; 629 630 for (BinaryBasicBlock *const BB : FF) { 631 for (auto II = BB->begin(); II != BB->end(); ++II) { 632 if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II)) 633 continue; 634 635 const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); 636 assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size."); 637 if (NewGnuArgsSize == CurrentGnuArgsSize) 638 continue; 639 640 auto InsertII = BF->addCFIInstruction( 641 BB, II, 642 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); 643 CurrentGnuArgsSize = NewGnuArgsSize; 644 II = std::next(InsertII); 645 } 646 } 647 } 648 } 649 return Error::success(); 650 } 651 652 // Check for dirty state in MCSymbol objects that might be a consequence 653 // of running calculateEmittedSize() in parallel, during split functions 654 // pass. If an inconsistent state is found (symbol already registered or 655 // already defined), clean it. 656 Error CleanMCState::runOnFunctions(BinaryContext &BC) { 657 MCContext &Ctx = *BC.Ctx; 658 for (const auto &SymMapEntry : Ctx.getSymbols()) { 659 const MCSymbol *S = SymMapEntry.getValue().Symbol; 660 if (!S) 661 continue; 662 if (S->isDefined()) { 663 LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() 664 << "\" is already defined\n"); 665 const_cast<MCSymbol *>(S)->setUndefined(); 666 } 667 if (S->isRegistered()) { 668 LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() 669 << "\" is already registered\n"); 670 const_cast<MCSymbol *>(S)->setIsRegistered(false); 671 } 672 LLVM_DEBUG(if (S->isVariable()) { 673 dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n"; 674 }); 675 } 676 return Error::success(); 677 } 678 679 // This peephole fixes jump instructions that jump to another basic 680 // block with a single jump instruction, e.g. 681 // 682 // B0: ... 683 // jmp B1 (or jcc B1) 684 // 685 // B1: jmp B2 686 // 687 // -> 688 // 689 // B0: ... 690 // jmp B2 (or jcc B2) 691 // 692 static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) { 693 uint64_t NumDoubleJumps = 0; 694 695 MCContext *Ctx = Function.getBinaryContext().Ctx.get(); 696 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 697 for (BinaryBasicBlock &BB : Function) { 698 auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ, 699 const MCSymbol *SuccSym, 700 std::optional<uint32_t> Offset) { 701 // Ignore infinite loop jumps or fallthrough tail jumps. 702 if (Pred == Succ || Succ == &BB) 703 return false; 704 705 if (Succ) { 706 const MCSymbol *TBB = nullptr; 707 const MCSymbol *FBB = nullptr; 708 MCInst *CondBranch = nullptr; 709 MCInst *UncondBranch = nullptr; 710 bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 711 if (!Res) { 712 LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; 713 Pred->dump()); 714 return false; 715 } 716 Pred->replaceSuccessor(&BB, Succ); 717 718 // We must patch up any existing branch instructions to match up 719 // with the new successor. 720 assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) && 721 "Predecessor block has inconsistent number of successors"); 722 if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) { 723 MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); 724 } else if (UncondBranch && 725 MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) { 726 MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); 727 } else if (!UncondBranch) { 728 assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ && 729 "Don't add an explicit jump to a fallthrough block."); 730 Pred->addBranchInstruction(Succ); 731 } 732 } else { 733 // Succ will be null in the tail call case. In this case we 734 // need to explicitly add a tail call instruction. 735 MCInst *Branch = Pred->getLastNonPseudoInstr(); 736 if (Branch && MIB->isUnconditionalBranch(*Branch)) { 737 assert(MIB->getTargetSymbol(*Branch) == BB.getLabel()); 738 Pred->removeSuccessor(&BB); 739 Pred->eraseInstruction(Pred->findInstruction(Branch)); 740 Pred->addTailCallInstruction(SuccSym); 741 if (Offset) { 742 MCInst *TailCall = Pred->getLastNonPseudoInstr(); 743 assert(TailCall); 744 MIB->setOffset(*TailCall, *Offset); 745 } 746 } else { 747 return false; 748 } 749 } 750 751 ++NumDoubleJumps; 752 LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from " 753 << Pred->getName() << " -> " << BB.getName() << " to " 754 << Pred->getName() << " -> " << SuccSym->getName() 755 << (!Succ ? " (tail)\n" : "\n")); 756 757 return true; 758 }; 759 760 if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) 761 continue; 762 763 MCInst *Inst = BB.getFirstNonPseudoInstr(); 764 const bool IsTailCall = MIB->isTailCall(*Inst); 765 766 if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall) 767 continue; 768 769 // If we operate after SCTC make sure it's not a conditional tail call. 770 if (IsTailCall && MIB->isConditionalBranch(*Inst)) 771 continue; 772 773 const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst); 774 BinaryBasicBlock *Succ = BB.getSuccessor(); 775 776 if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym)) 777 continue; 778 779 std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()}; 780 781 for (BinaryBasicBlock *Pred : Preds) { 782 if (Pred->isLandingPad()) 783 continue; 784 785 if (Pred->getSuccessor() == &BB || 786 (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || 787 Pred->getConditionalSuccessor(false) == &BB) 788 if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) && 789 MarkInvalid) 790 BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() || 791 BB.isEntryPoint()); 792 } 793 } 794 795 return NumDoubleJumps; 796 } 797 798 bool SimplifyConditionalTailCalls::shouldRewriteBranch( 799 const BinaryBasicBlock *PredBB, const MCInst &CondBranch, 800 const BinaryBasicBlock *BB, const bool DirectionFlag) { 801 if (BeenOptimized.count(PredBB)) 802 return false; 803 804 const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); 805 806 if (IsForward) 807 ++NumOrigForwardBranches; 808 else 809 ++NumOrigBackwardBranches; 810 811 if (opts::SctcMode == opts::SctcAlways) 812 return true; 813 814 if (opts::SctcMode == opts::SctcPreserveDirection) 815 return IsForward == DirectionFlag; 816 817 const ErrorOr<std::pair<double, double>> Frequency = 818 PredBB->getBranchStats(BB); 819 820 // It's ok to rewrite the conditional branch if the new target will be 821 // a backward branch. 822 823 // If no data available for these branches, then it should be ok to 824 // do the optimization since it will reduce code size. 825 if (Frequency.getError()) 826 return true; 827 828 // TODO: should this use misprediction frequency instead? 829 const bool Result = (IsForward && Frequency.get().first >= 0.5) || 830 (!IsForward && Frequency.get().first <= 0.5); 831 832 return Result == DirectionFlag; 833 } 834 835 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) { 836 // Need updated indices to correctly detect branch' direction. 837 BF.getLayout().updateLayoutIndices(); 838 BF.markUnreachableBlocks(); 839 840 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 841 MCContext *Ctx = BF.getBinaryContext().Ctx.get(); 842 uint64_t NumLocalCTCCandidates = 0; 843 uint64_t NumLocalCTCs = 0; 844 uint64_t LocalCTCTakenCount = 0; 845 uint64_t LocalCTCExecCount = 0; 846 std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>> 847 NeedsUncondBranch; 848 849 // Will block be deleted by UCE? 850 auto isValid = [](const BinaryBasicBlock *BB) { 851 return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint()); 852 }; 853 854 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { 855 // Locate BB with a single direct tail-call instruction. 856 if (BB->getNumNonPseudos() != 1) 857 continue; 858 859 MCInst *Instr = BB->getFirstNonPseudoInstr(); 860 if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr)) 861 continue; 862 863 const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr); 864 if (!CalleeSymbol) 865 continue; 866 867 // Detect direction of the possible conditional tail call. 868 const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol); 869 870 // Iterate through all predecessors. 871 for (BinaryBasicBlock *PredBB : BB->predecessors()) { 872 BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true); 873 if (!CondSucc) 874 continue; 875 876 ++NumLocalCTCCandidates; 877 878 const MCSymbol *TBB = nullptr; 879 const MCSymbol *FBB = nullptr; 880 MCInst *CondBranch = nullptr; 881 MCInst *UncondBranch = nullptr; 882 bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 883 884 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz 885 if (!Result) { 886 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n"; 887 PredBB->dump()); 888 continue; 889 } 890 891 assert(Result && "internal error analyzing conditional branch"); 892 assert(CondBranch && "conditional branch expected"); 893 894 // Skip dynamic branches for now. 895 if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch)) 896 continue; 897 898 // It's possible that PredBB is also a successor to BB that may have 899 // been processed by a previous iteration of the SCTC loop, in which 900 // case it may have been marked invalid. We should skip rewriting in 901 // this case. 902 if (!PredBB->isValid()) { 903 assert(PredBB->isSuccessor(BB) && 904 "PredBB should be valid if it is not a successor to BB"); 905 continue; 906 } 907 908 // We don't want to reverse direction of the branch in new order 909 // without further profile analysis. 910 const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC; 911 if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag)) 912 continue; 913 914 // Record this block so that we don't try to optimize it twice. 915 BeenOptimized.insert(PredBB); 916 917 uint64_t Count = 0; 918 if (CondSucc != BB) { 919 // Patch the new target address into the conditional branch. 920 MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx); 921 // Since we reversed the condition on the branch we need to change 922 // the target for the unconditional branch or add a unconditional 923 // branch to the old target. This has to be done manually since 924 // fixupBranches is not called after SCTC. 925 NeedsUncondBranch.emplace_back(PredBB, CondSucc); 926 Count = PredBB->getFallthroughBranchInfo().Count; 927 } else { 928 // Change destination of the conditional branch. 929 MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx); 930 Count = PredBB->getTakenBranchInfo().Count; 931 } 932 const uint64_t CTCTakenFreq = 933 Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; 934 935 // Annotate it, so "isCall" returns true for this jcc 936 MIB->setConditionalTailCall(*CondBranch); 937 // Add info about the conditional tail call frequency, otherwise this 938 // info will be lost when we delete the associated BranchInfo entry 939 auto &CTCAnnotation = 940 MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount"); 941 CTCAnnotation = CTCTakenFreq; 942 // Preserve Offset annotation, used in BAT. 943 // Instr is a direct tail call instruction that was created when CTCs are 944 // first expanded, and has the original CTC offset set. 945 if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr)) 946 MIB->setOffset(*CondBranch, *Offset); 947 948 // Remove the unused successor which may be eliminated later 949 // if there are no other users. 950 PredBB->removeSuccessor(BB); 951 // Update BB execution count 952 if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount()) 953 BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); 954 else if (CTCTakenFreq > BB->getKnownExecutionCount()) 955 BB->setExecutionCount(0); 956 957 ++NumLocalCTCs; 958 LocalCTCTakenCount += CTCTakenFreq; 959 LocalCTCExecCount += PredBB->getKnownExecutionCount(); 960 } 961 962 // Remove the block from CFG if all predecessors were removed. 963 BB->markValid(isValid(BB)); 964 } 965 966 // Add unconditional branches at the end of BBs to new successors 967 // as long as the successor is not a fallthrough. 968 for (auto &Entry : NeedsUncondBranch) { 969 BinaryBasicBlock *PredBB = Entry.first; 970 const BinaryBasicBlock *CondSucc = Entry.second; 971 972 const MCSymbol *TBB = nullptr; 973 const MCSymbol *FBB = nullptr; 974 MCInst *CondBranch = nullptr; 975 MCInst *UncondBranch = nullptr; 976 PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 977 978 // Find the next valid block. Invalid blocks will be deleted 979 // so they shouldn't be considered fallthrough targets. 980 const BinaryBasicBlock *NextBlock = 981 BF.getLayout().getBasicBlockAfter(PredBB, false); 982 while (NextBlock && !isValid(NextBlock)) 983 NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false); 984 985 // Get the unconditional successor to this block. 986 const BinaryBasicBlock *PredSucc = PredBB->getSuccessor(); 987 assert(PredSucc && "The other branch should be a tail call"); 988 989 const bool HasFallthrough = (NextBlock && PredSucc == NextBlock); 990 991 if (UncondBranch) { 992 if (HasFallthrough) 993 PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch)); 994 else 995 MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx); 996 } else if (!HasFallthrough) { 997 MCInst Branch; 998 MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx); 999 PredBB->addInstruction(Branch); 1000 } 1001 } 1002 1003 if (NumLocalCTCs > 0) { 1004 NumDoubleJumps += fixDoubleJumps(BF, true); 1005 // Clean-up unreachable tail-call blocks. 1006 const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs(); 1007 DeletedBlocks += Stats.first; 1008 DeletedBytes += Stats.second; 1009 1010 assert(BF.validateCFG()); 1011 } 1012 1013 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs 1014 << " conditional tail calls from a total of " 1015 << NumLocalCTCCandidates << " candidates in function " << BF 1016 << ". CTCs execution count for this function is " 1017 << LocalCTCExecCount << " and CTC taken count is " 1018 << LocalCTCTakenCount << "\n";); 1019 1020 NumTailCallsPatched += NumLocalCTCs; 1021 NumCandidateTailCalls += NumLocalCTCCandidates; 1022 CTCExecCount += LocalCTCExecCount; 1023 CTCTakenCount += LocalCTCTakenCount; 1024 1025 return NumLocalCTCs > 0; 1026 } 1027 1028 Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { 1029 if (!BC.isX86()) 1030 return Error::success(); 1031 1032 for (auto &It : BC.getBinaryFunctions()) { 1033 BinaryFunction &Function = It.second; 1034 1035 if (!shouldOptimize(Function)) 1036 continue; 1037 1038 if (fixTailCalls(Function)) { 1039 Modified.insert(&Function); 1040 Function.setHasCanonicalCFG(false); 1041 } 1042 } 1043 1044 if (NumTailCallsPatched) 1045 BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched 1046 << " tail calls (" << NumOrigForwardBranches << " forward)" 1047 << " tail calls (" << NumOrigBackwardBranches << " backward)" 1048 << " from a total of " << NumCandidateTailCalls 1049 << " while removing " << NumDoubleJumps << " double jumps" 1050 << " and removing " << DeletedBlocks << " basic blocks" 1051 << " totalling " << DeletedBytes 1052 << " bytes of code. CTCs total execution count is " 1053 << CTCExecCount << " and the number of times CTCs are taken is " 1054 << CTCTakenCount << "\n"; 1055 return Error::success(); 1056 } 1057 1058 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) { 1059 uint64_t Count = 0; 1060 const BinaryContext &BC = Function.getBinaryContext(); 1061 for (BinaryBasicBlock &BB : Function) { 1062 for (MCInst &Inst : BB) { 1063 // Skip shortening instructions with Size annotation. 1064 if (BC.MIB->getSize(Inst)) 1065 continue; 1066 1067 MCInst OriginalInst; 1068 if (opts::Verbosity > 2) 1069 OriginalInst = Inst; 1070 1071 if (!BC.MIB->shortenInstruction(Inst, *BC.STI)) 1072 continue; 1073 1074 if (opts::Verbosity > 2) { 1075 BC.scopeLock(); 1076 BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: "; 1077 BC.printInstruction(BC.outs(), OriginalInst, 0, &Function); 1078 BC.outs() << "BOLT-INFO: to:"; 1079 BC.printInstruction(BC.outs(), Inst, 0, &Function); 1080 } 1081 1082 ++Count; 1083 } 1084 } 1085 1086 return Count; 1087 } 1088 1089 Error ShortenInstructions::runOnFunctions(BinaryContext &BC) { 1090 std::atomic<uint64_t> NumShortened{0}; 1091 if (!BC.isX86()) 1092 return Error::success(); 1093 1094 ParallelUtilities::runOnEachFunction( 1095 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, 1096 [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); }, 1097 nullptr, "ShortenInstructions"); 1098 1099 if (NumShortened) 1100 BC.outs() << "BOLT-INFO: " << NumShortened 1101 << " instructions were shortened\n"; 1102 return Error::success(); 1103 } 1104 1105 void Peepholes::addTailcallTraps(BinaryFunction &Function) { 1106 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 1107 for (BinaryBasicBlock &BB : Function) { 1108 MCInst *Inst = BB.getLastNonPseudoInstr(); 1109 if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) { 1110 MCInst Trap; 1111 MIB->createTrap(Trap); 1112 BB.addInstruction(Trap); 1113 ++TailCallTraps; 1114 } 1115 } 1116 } 1117 1118 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { 1119 for (BinaryBasicBlock &BB : Function) { 1120 if (BB.succ_size() != 2) 1121 continue; 1122 1123 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true); 1124 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false); 1125 if (CondBB != UncondBB) 1126 continue; 1127 1128 const MCSymbol *TBB = nullptr; 1129 const MCSymbol *FBB = nullptr; 1130 MCInst *CondBranch = nullptr; 1131 MCInst *UncondBranch = nullptr; 1132 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 1133 1134 // analyzeBranch() can fail due to unusual branch instructions, 1135 // e.g. jrcxz, or jump tables (indirect jump). 1136 if (!Result || !CondBranch) 1137 continue; 1138 1139 BB.removeDuplicateConditionalSuccessor(CondBranch); 1140 ++NumUselessCondBranches; 1141 } 1142 } 1143 1144 Error Peepholes::runOnFunctions(BinaryContext &BC) { 1145 const char Opts = 1146 std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0, 1147 [](const char A, const PeepholeOpts B) { return A | B; }); 1148 if (Opts == PEEP_NONE) 1149 return Error::success(); 1150 1151 for (auto &It : BC.getBinaryFunctions()) { 1152 BinaryFunction &Function = It.second; 1153 if (shouldOptimize(Function)) { 1154 if (Opts & PEEP_DOUBLE_JUMPS) 1155 NumDoubleJumps += fixDoubleJumps(Function, false); 1156 if (Opts & PEEP_TAILCALL_TRAPS) 1157 addTailcallTraps(Function); 1158 if (Opts & PEEP_USELESS_BRANCHES) 1159 removeUselessCondBranches(Function); 1160 assert(Function.validateCFG()); 1161 } 1162 } 1163 BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps 1164 << " double jumps patched.\n" 1165 << "BOLT-INFO: Peephole: " << TailCallTraps 1166 << " tail call traps inserted.\n" 1167 << "BOLT-INFO: Peephole: " << NumUselessCondBranches 1168 << " useless conditional branches removed.\n"; 1169 return Error::success(); 1170 } 1171 1172 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) { 1173 BinaryContext &BC = BF.getBinaryContext(); 1174 MCPlusBuilder *MIB = BC.MIB.get(); 1175 1176 uint64_t NumLocalLoadsSimplified = 0; 1177 uint64_t NumDynamicLocalLoadsSimplified = 0; 1178 uint64_t NumLocalLoadsFound = 0; 1179 uint64_t NumDynamicLocalLoadsFound = 0; 1180 1181 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { 1182 for (MCInst &Inst : *BB) { 1183 unsigned Opcode = Inst.getOpcode(); 1184 const MCInstrDesc &Desc = BC.MII->get(Opcode); 1185 1186 // Skip instructions that do not load from memory. 1187 if (!Desc.mayLoad()) 1188 continue; 1189 1190 // Try to statically evaluate the target memory address; 1191 uint64_t TargetAddress; 1192 1193 if (MIB->hasPCRelOperand(Inst)) { 1194 // Try to find the symbol that corresponds to the PC-relative operand. 1195 MCOperand *DispOpI = MIB->getMemOperandDisp(Inst); 1196 assert(DispOpI != Inst.end() && "expected PC-relative displacement"); 1197 assert(DispOpI->isExpr() && 1198 "found PC-relative with non-symbolic displacement"); 1199 1200 // Get displacement symbol. 1201 const MCSymbol *DisplSymbol; 1202 uint64_t DisplOffset; 1203 1204 std::tie(DisplSymbol, DisplOffset) = 1205 MIB->getTargetSymbolInfo(DispOpI->getExpr()); 1206 1207 if (!DisplSymbol) 1208 continue; 1209 1210 // Look up the symbol address in the global symbols map of the binary 1211 // context object. 1212 BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName()); 1213 if (!BD) 1214 continue; 1215 TargetAddress = BD->getAddress() + DisplOffset; 1216 } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) { 1217 continue; 1218 } 1219 1220 // Get the contents of the section containing the target address of the 1221 // memory operand. We are only interested in read-only sections. 1222 ErrorOr<BinarySection &> DataSection = 1223 BC.getSectionForAddress(TargetAddress); 1224 if (!DataSection || DataSection->isWritable()) 1225 continue; 1226 1227 if (BC.getRelocationAt(TargetAddress) || 1228 BC.getDynamicRelocationAt(TargetAddress)) 1229 continue; 1230 1231 uint32_t Offset = TargetAddress - DataSection->getAddress(); 1232 StringRef ConstantData = DataSection->getContents(); 1233 1234 ++NumLocalLoadsFound; 1235 if (BB->hasProfile()) 1236 NumDynamicLocalLoadsFound += BB->getExecutionCount(); 1237 1238 if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { 1239 ++NumLocalLoadsSimplified; 1240 if (BB->hasProfile()) 1241 NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); 1242 } 1243 } 1244 } 1245 1246 NumLoadsFound += NumLocalLoadsFound; 1247 NumDynamicLoadsFound += NumDynamicLocalLoadsFound; 1248 NumLoadsSimplified += NumLocalLoadsSimplified; 1249 NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; 1250 1251 return NumLocalLoadsSimplified > 0; 1252 } 1253 1254 Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { 1255 for (auto &It : BC.getBinaryFunctions()) { 1256 BinaryFunction &Function = It.second; 1257 if (shouldOptimize(Function) && simplifyRODataLoads(Function)) 1258 Modified.insert(&Function); 1259 } 1260 1261 BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " 1262 << NumLoadsFound << " loads from a statically computed address.\n" 1263 << "BOLT-INFO: dynamic loads simplified: " 1264 << NumDynamicLoadsSimplified << "\n" 1265 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound 1266 << "\n"; 1267 return Error::success(); 1268 } 1269 1270 Error AssignSections::runOnFunctions(BinaryContext &BC) { 1271 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { 1272 Function->setCodeSectionName(BC.getInjectedCodeSectionName()); 1273 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); 1274 } 1275 1276 // In non-relocation mode functions have pre-assigned section names. 1277 if (!BC.HasRelocations) 1278 return Error::success(); 1279 1280 const bool UseColdSection = 1281 BC.NumProfiledFuncs > 0 || 1282 opts::ReorderFunctions == ReorderFunctions::RT_USER; 1283 for (auto &BFI : BC.getBinaryFunctions()) { 1284 BinaryFunction &Function = BFI.second; 1285 if (opts::isHotTextMover(Function)) { 1286 Function.setCodeSectionName(BC.getHotTextMoverSectionName()); 1287 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); 1288 continue; 1289 } 1290 1291 if (!UseColdSection || Function.hasValidIndex()) 1292 Function.setCodeSectionName(BC.getMainCodeSectionName()); 1293 else 1294 Function.setCodeSectionName(BC.getColdCodeSectionName()); 1295 1296 if (Function.isSplit()) 1297 Function.setColdCodeSectionName(BC.getColdCodeSectionName()); 1298 } 1299 return Error::success(); 1300 } 1301 1302 Error PrintProfileStats::runOnFunctions(BinaryContext &BC) { 1303 double FlowImbalanceMean = 0.0; 1304 size_t NumBlocksConsidered = 0; 1305 double WorstBias = 0.0; 1306 const BinaryFunction *WorstBiasFunc = nullptr; 1307 1308 // For each function CFG, we fill an IncomingMap with the sum of the frequency 1309 // of incoming edges for each BB. Likewise for each OutgoingMap and the sum 1310 // of the frequency of outgoing edges. 1311 using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>; 1312 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps; 1313 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps; 1314 1315 // Compute mean 1316 for (const auto &BFI : BC.getBinaryFunctions()) { 1317 const BinaryFunction &Function = BFI.second; 1318 if (Function.empty() || !Function.isSimple()) 1319 continue; 1320 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1321 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1322 for (const BinaryBasicBlock &BB : Function) { 1323 uint64_t TotalOutgoing = 0ULL; 1324 auto SuccBIIter = BB.branch_info_begin(); 1325 for (BinaryBasicBlock *Succ : BB.successors()) { 1326 uint64_t Count = SuccBIIter->Count; 1327 if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { 1328 ++SuccBIIter; 1329 continue; 1330 } 1331 TotalOutgoing += Count; 1332 IncomingMap[Succ] += Count; 1333 ++SuccBIIter; 1334 } 1335 OutgoingMap[&BB] = TotalOutgoing; 1336 } 1337 1338 size_t NumBlocks = 0; 1339 double Mean = 0.0; 1340 for (const BinaryBasicBlock &BB : Function) { 1341 // Do not compute score for low frequency blocks, entry or exit blocks 1342 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint()) 1343 continue; 1344 ++NumBlocks; 1345 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1346 Mean += fabs(Difference / IncomingMap[&BB]); 1347 } 1348 1349 FlowImbalanceMean += Mean; 1350 NumBlocksConsidered += NumBlocks; 1351 if (!NumBlocks) 1352 continue; 1353 double FuncMean = Mean / NumBlocks; 1354 if (FuncMean > WorstBias) { 1355 WorstBias = FuncMean; 1356 WorstBiasFunc = &Function; 1357 } 1358 } 1359 if (NumBlocksConsidered > 0) 1360 FlowImbalanceMean /= NumBlocksConsidered; 1361 1362 // Compute standard deviation 1363 NumBlocksConsidered = 0; 1364 double FlowImbalanceVar = 0.0; 1365 for (const auto &BFI : BC.getBinaryFunctions()) { 1366 const BinaryFunction &Function = BFI.second; 1367 if (Function.empty() || !Function.isSimple()) 1368 continue; 1369 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1370 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1371 for (const BinaryBasicBlock &BB : Function) { 1372 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0) 1373 continue; 1374 ++NumBlocksConsidered; 1375 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1376 FlowImbalanceVar += 1377 pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2); 1378 } 1379 } 1380 if (NumBlocksConsidered) { 1381 FlowImbalanceVar /= NumBlocksConsidered; 1382 FlowImbalanceVar = sqrt(FlowImbalanceVar); 1383 } 1384 1385 // Report to user 1386 BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n", 1387 (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar)); 1388 if (WorstBiasFunc && opts::Verbosity >= 1) { 1389 BC.outs() << "Worst average bias observed in " 1390 << WorstBiasFunc->getPrintName() << "\n"; 1391 LLVM_DEBUG(WorstBiasFunc->dump()); 1392 } 1393 return Error::success(); 1394 } 1395 1396 Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { 1397 uint64_t NumRegularFunctions = 0; 1398 uint64_t NumStaleProfileFunctions = 0; 1399 uint64_t NumAllStaleFunctions = 0; 1400 uint64_t NumInferredFunctions = 0; 1401 uint64_t NumNonSimpleProfiledFunctions = 0; 1402 uint64_t NumUnknownControlFlowFunctions = 0; 1403 uint64_t TotalSampleCount = 0; 1404 uint64_t StaleSampleCount = 0; 1405 uint64_t InferredSampleCount = 0; 1406 std::vector<const BinaryFunction *> ProfiledFunctions; 1407 std::vector<std::pair<double, uint64_t>> FuncDensityList; 1408 const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; 1409 for (auto &BFI : BC.getBinaryFunctions()) { 1410 const BinaryFunction &Function = BFI.second; 1411 1412 // Ignore PLT functions for stats. 1413 if (Function.isPLTFunction()) 1414 continue; 1415 1416 // Adjustment for BAT mode: the profile for BOLT split fragments is combined 1417 // so only count the hot fragment. 1418 const uint64_t Address = Function.getAddress(); 1419 bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() && 1420 BAT && BAT->isBATFunction(Address) && 1421 !BAT->fetchParentAddress(Address); 1422 1423 ++NumRegularFunctions; 1424 1425 // In BOLTed binaries split functions are non-simple (due to non-relocation 1426 // mode), but the original function is known to be simple and we have a 1427 // valid profile for it. 1428 if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) { 1429 if (Function.hasProfile()) 1430 ++NumNonSimpleProfiledFunctions; 1431 continue; 1432 } 1433 1434 if (Function.hasUnknownControlFlow()) { 1435 if (opts::PrintUnknownCFG) 1436 Function.dump(); 1437 else if (opts::PrintUnknown) 1438 BC.errs() << "function with unknown control flow: " << Function << '\n'; 1439 1440 ++NumUnknownControlFlowFunctions; 1441 } 1442 1443 if (!Function.hasProfile()) 1444 continue; 1445 1446 uint64_t SampleCount = Function.getRawBranchCount(); 1447 TotalSampleCount += SampleCount; 1448 1449 if (Function.hasValidProfile()) { 1450 ProfiledFunctions.push_back(&Function); 1451 if (Function.hasInferredProfile()) { 1452 ++NumInferredFunctions; 1453 InferredSampleCount += SampleCount; 1454 ++NumAllStaleFunctions; 1455 } 1456 } else { 1457 if (opts::ReportStaleFuncs) { 1458 BC.outs() << StaleFuncsHeader; 1459 StaleFuncsHeader = ""; 1460 BC.outs() << " " << Function << '\n'; 1461 } 1462 ++NumStaleProfileFunctions; 1463 StaleSampleCount += SampleCount; 1464 ++NumAllStaleFunctions; 1465 } 1466 1467 if (opts::ShowDensity) { 1468 uint64_t Size = Function.getSize(); 1469 // In case of BOLT split functions registered in BAT, executed traces are 1470 // automatically attributed to the main fragment. Add up function sizes 1471 // for all fragments. 1472 if (IsHotParentOfBOLTSplitFunction) 1473 for (const BinaryFunction *Fragment : Function.getFragments()) 1474 Size += Fragment->getSize(); 1475 double Density = (double)1.0 * Function.getSampleCountInBytes() / Size; 1476 FuncDensityList.emplace_back(Density, SampleCount); 1477 LLVM_DEBUG(BC.outs() << Function << ": executed bytes " 1478 << Function.getSampleCountInBytes() << ", size (b) " 1479 << Size << ", density " << Density 1480 << ", sample count " << SampleCount << '\n'); 1481 } 1482 } 1483 BC.NumProfiledFuncs = ProfiledFunctions.size(); 1484 BC.NumStaleProfileFuncs = NumStaleProfileFunctions; 1485 1486 const size_t NumAllProfiledFunctions = 1487 ProfiledFunctions.size() + NumStaleProfileFunctions; 1488 BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " 1489 << NumRegularFunctions << " functions in the binary (" 1490 << format("%.1f", NumAllProfiledFunctions / 1491 (float)NumRegularFunctions * 100.0f) 1492 << "%) have non-empty execution profile\n"; 1493 if (NumNonSimpleProfiledFunctions) { 1494 BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" 1495 << (NumNonSimpleProfiledFunctions == 1 ? "" : "s") 1496 << " with profile could not be optimized\n"; 1497 } 1498 if (NumAllStaleFunctions) { 1499 const float PctStale = 1500 NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f; 1501 const float PctStaleFuncsWithEqualBlockCount = 1502 (float)BC.Stats.NumStaleFuncsWithEqualBlockCount / 1503 NumAllStaleFunctions * 100.0f; 1504 const float PctStaleBlocksWithEqualIcount = 1505 (float)BC.Stats.NumStaleBlocksWithEqualIcount / 1506 BC.Stats.NumStaleBlocks * 100.0f; 1507 auto printErrorOrWarning = [&]() { 1508 if (PctStale > opts::StaleThreshold) 1509 BC.errs() << "BOLT-ERROR: "; 1510 else 1511 BC.errs() << "BOLT-WARNING: "; 1512 }; 1513 printErrorOrWarning(); 1514 BC.errs() << NumAllStaleFunctions 1515 << format(" (%.1f%% of all profiled)", PctStale) << " function" 1516 << (NumAllStaleFunctions == 1 ? "" : "s") 1517 << " have invalid (possibly stale) profile." 1518 " Use -report-stale to see the list.\n"; 1519 if (TotalSampleCount > 0) { 1520 printErrorOrWarning(); 1521 BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of " 1522 << TotalSampleCount << " samples in the binary (" 1523 << format("%.1f", 1524 ((100.0f * (StaleSampleCount + InferredSampleCount)) / 1525 TotalSampleCount)) 1526 << "%) belong to functions with invalid" 1527 " (possibly stale) profile.\n"; 1528 } 1529 BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount 1530 << " stale function" 1531 << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s") 1532 << format(" (%.1f%% of all stale)", 1533 PctStaleFuncsWithEqualBlockCount) 1534 << " have matching block count.\n"; 1535 BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount 1536 << " stale block" 1537 << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s") 1538 << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount) 1539 << " have matching icount.\n"; 1540 if (PctStale > opts::StaleThreshold) { 1541 return createFatalBOLTError( 1542 Twine("BOLT-ERROR: stale functions exceed specified threshold of ") + 1543 Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n")); 1544 } 1545 } 1546 if (NumInferredFunctions) { 1547 BC.outs() << format( 1548 "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, " 1549 "%.2f%% of stale) functions responsible for %.2f%% samples" 1550 " (%zu out of %zu)\n", 1551 NumInferredFunctions, 1552 100.0 * NumInferredFunctions / NumAllProfiledFunctions, 1553 100.0 * NumInferredFunctions / NumAllStaleFunctions, 1554 100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount, 1555 TotalSampleCount); 1556 BC.outs() << format( 1557 "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" 1558 " (%zu out of %zu stale) responsible for %.2f%% samples" 1559 " (%zu out of %zu stale)\n", 1560 100.0 * BC.Stats.NumExactMatchedBlocks / BC.Stats.NumStaleBlocks, 1561 BC.Stats.NumExactMatchedBlocks, BC.Stats.NumStaleBlocks, 1562 100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount, 1563 BC.Stats.ExactMatchedSampleCount, BC.Stats.StaleSampleCount); 1564 BC.outs() << format( 1565 "BOLT-INFO: inference found an exact pseudo probe match for %.2f%% of " 1566 "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples" 1567 " (%zu out of %zu stale)\n", 1568 100.0 * BC.Stats.NumPseudoProbeExactMatchedBlocks / 1569 BC.Stats.NumStaleBlocks, 1570 BC.Stats.NumPseudoProbeExactMatchedBlocks, BC.Stats.NumStaleBlocks, 1571 100.0 * BC.Stats.PseudoProbeExactMatchedSampleCount / 1572 BC.Stats.StaleSampleCount, 1573 BC.Stats.PseudoProbeExactMatchedSampleCount, BC.Stats.StaleSampleCount); 1574 BC.outs() << format( 1575 "BOLT-INFO: inference found a loose pseudo probe match for %.2f%% of " 1576 "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples" 1577 " (%zu out of %zu stale)\n", 1578 100.0 * BC.Stats.NumPseudoProbeLooseMatchedBlocks / 1579 BC.Stats.NumStaleBlocks, 1580 BC.Stats.NumPseudoProbeLooseMatchedBlocks, BC.Stats.NumStaleBlocks, 1581 100.0 * BC.Stats.PseudoProbeLooseMatchedSampleCount / 1582 BC.Stats.StaleSampleCount, 1583 BC.Stats.PseudoProbeLooseMatchedSampleCount, BC.Stats.StaleSampleCount); 1584 BC.outs() << format( 1585 "BOLT-INFO: inference found a call match for %.2f%% of basic " 1586 "blocks" 1587 " (%zu out of %zu stale) responsible for %.2f%% samples" 1588 " (%zu out of %zu stale)\n", 1589 100.0 * BC.Stats.NumCallMatchedBlocks / BC.Stats.NumStaleBlocks, 1590 BC.Stats.NumCallMatchedBlocks, BC.Stats.NumStaleBlocks, 1591 100.0 * BC.Stats.CallMatchedSampleCount / BC.Stats.StaleSampleCount, 1592 BC.Stats.CallMatchedSampleCount, BC.Stats.StaleSampleCount); 1593 BC.outs() << format( 1594 "BOLT-INFO: inference found a loose match for %.2f%% of basic " 1595 "blocks" 1596 " (%zu out of %zu stale) responsible for %.2f%% samples" 1597 " (%zu out of %zu stale)\n", 1598 100.0 * BC.Stats.NumLooseMatchedBlocks / BC.Stats.NumStaleBlocks, 1599 BC.Stats.NumLooseMatchedBlocks, BC.Stats.NumStaleBlocks, 1600 100.0 * BC.Stats.LooseMatchedSampleCount / BC.Stats.StaleSampleCount, 1601 BC.Stats.LooseMatchedSampleCount, BC.Stats.StaleSampleCount); 1602 } 1603 1604 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1605 BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1606 << " objects was ignored\n"; 1607 } 1608 1609 if (ProfiledFunctions.size() > 10) { 1610 if (opts::Verbosity >= 1) { 1611 BC.outs() << "BOLT-INFO: top called functions are:\n"; 1612 llvm::sort(ProfiledFunctions, 1613 [](const BinaryFunction *A, const BinaryFunction *B) { 1614 return B->getExecutionCount() < A->getExecutionCount(); 1615 }); 1616 auto SFI = ProfiledFunctions.begin(); 1617 auto SFIend = ProfiledFunctions.end(); 1618 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1619 ++SFI, ++I) 1620 BC.outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() 1621 << '\n'; 1622 } 1623 } 1624 1625 if (!opts::PrintSortedBy.empty()) { 1626 std::vector<BinaryFunction *> Functions; 1627 std::map<const BinaryFunction *, DynoStats> Stats; 1628 1629 for (auto &BFI : BC.getBinaryFunctions()) { 1630 BinaryFunction &BF = BFI.second; 1631 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1632 Functions.push_back(&BF); 1633 Stats.emplace(&BF, getDynoStats(BF)); 1634 } 1635 } 1636 1637 const bool SortAll = 1638 llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT); 1639 1640 const bool Ascending = 1641 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1642 1643 std::function<bool(const DynoStats &, const DynoStats &)> 1644 DynoStatsComparator = 1645 SortAll ? [](const DynoStats &StatsA, 1646 const DynoStats &StatsB) { return StatsA < StatsB; } 1647 : [](const DynoStats &StatsA, const DynoStats &StatsB) { 1648 return StatsA.lessThan(StatsB, opts::PrintSortedBy); 1649 }; 1650 1651 llvm::stable_sort(Functions, 1652 [Ascending, &Stats, DynoStatsComparator]( 1653 const BinaryFunction *A, const BinaryFunction *B) { 1654 auto StatsItr = Stats.find(A); 1655 assert(StatsItr != Stats.end()); 1656 const DynoStats &StatsA = StatsItr->second; 1657 1658 StatsItr = Stats.find(B); 1659 assert(StatsItr != Stats.end()); 1660 const DynoStats &StatsB = StatsItr->second; 1661 1662 return Ascending ? DynoStatsComparator(StatsA, StatsB) 1663 : DynoStatsComparator(StatsB, StatsA); 1664 }); 1665 1666 BC.outs() << "BOLT-INFO: top functions sorted by "; 1667 if (SortAll) { 1668 BC.outs() << "dyno stats"; 1669 } else { 1670 BC.outs() << "("; 1671 bool PrintComma = false; 1672 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1673 if (PrintComma) 1674 BC.outs() << ", "; 1675 BC.outs() << DynoStats::Description(Category); 1676 PrintComma = true; 1677 } 1678 BC.outs() << ")"; 1679 } 1680 1681 BC.outs() << " are:\n"; 1682 auto SFI = Functions.begin(); 1683 for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { 1684 const DynoStats Stats = getDynoStats(**SFI); 1685 BC.outs() << " " << **SFI; 1686 if (!SortAll) { 1687 BC.outs() << " ("; 1688 bool PrintComma = false; 1689 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1690 if (PrintComma) 1691 BC.outs() << ", "; 1692 BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; 1693 PrintComma = true; 1694 } 1695 BC.outs() << ")"; 1696 } 1697 BC.outs() << "\n"; 1698 } 1699 } 1700 1701 if (!BC.TrappedFunctions.empty()) { 1702 BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" 1703 << (BC.TrappedFunctions.size() > 1 ? "s" : "") 1704 << " will trap on entry. Use -trap-avx512=0 to disable" 1705 " traps."; 1706 if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { 1707 BC.errs() << '\n'; 1708 for (const BinaryFunction *Function : BC.TrappedFunctions) 1709 BC.errs() << " " << *Function << '\n'; 1710 } else { 1711 BC.errs() << " Use -v=1 to see the list.\n"; 1712 } 1713 } 1714 1715 // Collect and print information about suboptimal code layout on input. 1716 if (opts::ReportBadLayout) { 1717 std::vector<BinaryFunction *> SuboptimalFuncs; 1718 for (auto &BFI : BC.getBinaryFunctions()) { 1719 BinaryFunction &BF = BFI.second; 1720 if (!BF.hasValidProfile()) 1721 continue; 1722 1723 const uint64_t HotThreshold = 1724 std::max<uint64_t>(BF.getKnownExecutionCount(), 1); 1725 bool HotSeen = false; 1726 for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) { 1727 if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { 1728 HotSeen = true; 1729 continue; 1730 } 1731 if (HotSeen && BB->getKnownExecutionCount() == 0) { 1732 SuboptimalFuncs.push_back(&BF); 1733 break; 1734 } 1735 } 1736 } 1737 1738 if (!SuboptimalFuncs.empty()) { 1739 llvm::sort(SuboptimalFuncs, 1740 [](const BinaryFunction *A, const BinaryFunction *B) { 1741 return A->getKnownExecutionCount() / A->getSize() > 1742 B->getKnownExecutionCount() / B->getSize(); 1743 }); 1744 1745 BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size() 1746 << " functions have " 1747 "cold code in the middle of hot code. Top functions are:\n"; 1748 for (unsigned I = 0; 1749 I < std::min(static_cast<size_t>(opts::ReportBadLayout), 1750 SuboptimalFuncs.size()); 1751 ++I) 1752 SuboptimalFuncs[I]->print(BC.outs()); 1753 } 1754 } 1755 1756 if (NumUnknownControlFlowFunctions) { 1757 BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions 1758 << " functions have instructions with unknown control flow"; 1759 if (!opts::PrintUnknown) 1760 BC.outs() << ". Use -print-unknown to see the list."; 1761 BC.outs() << '\n'; 1762 } 1763 1764 if (opts::ShowDensity) { 1765 double Density = 0.0; 1766 // Sorted by the density in descending order. 1767 llvm::stable_sort(FuncDensityList, 1768 [&](const std::pair<double, uint64_t> &A, 1769 const std::pair<double, uint64_t> &B) { 1770 if (A.first != B.first) 1771 return A.first > B.first; 1772 return A.second < B.second; 1773 }); 1774 1775 uint64_t AccumulatedSamples = 0; 1776 uint32_t I = 0; 1777 assert(opts::ProfileDensityCutOffHot <= 1000000 && 1778 "The cutoff value is greater than 1000000(100%)"); 1779 while (AccumulatedSamples < 1780 TotalSampleCount * 1781 static_cast<float>(opts::ProfileDensityCutOffHot) / 1782 1000000 && 1783 I < FuncDensityList.size()) { 1784 AccumulatedSamples += FuncDensityList[I].second; 1785 Density = FuncDensityList[I].first; 1786 I++; 1787 } 1788 if (Density == 0.0) { 1789 BC.errs() << "BOLT-WARNING: the output profile is empty or the " 1790 "--profile-density-cutoff-hot option is " 1791 "set too low. Please check your command.\n"; 1792 } else if (Density < opts::ProfileDensityThreshold) { 1793 BC.errs() 1794 << "BOLT-WARNING: BOLT is estimated to optimize better with " 1795 << format("%.1f", opts::ProfileDensityThreshold / Density) 1796 << "x more samples. Please consider increasing sampling rate or " 1797 "profiling for longer duration to get more samples.\n"; 1798 } 1799 1800 BC.outs() << "BOLT-INFO: Functions with density >= " 1801 << format("%.1f", Density) << " account for " 1802 << format("%.2f", 1803 static_cast<double>(opts::ProfileDensityCutOffHot) / 1804 10000) 1805 << "% total sample counts.\n"; 1806 } 1807 return Error::success(); 1808 } 1809 1810 Error InstructionLowering::runOnFunctions(BinaryContext &BC) { 1811 for (auto &BFI : BC.getBinaryFunctions()) 1812 for (BinaryBasicBlock &BB : BFI.second) 1813 for (MCInst &Instruction : BB) 1814 BC.MIB->lowerTailCall(Instruction); 1815 return Error::success(); 1816 } 1817 1818 Error StripRepRet::runOnFunctions(BinaryContext &BC) { 1819 if (!BC.isX86()) 1820 return Error::success(); 1821 1822 uint64_t NumPrefixesRemoved = 0; 1823 uint64_t NumBytesSaved = 0; 1824 for (auto &BFI : BC.getBinaryFunctions()) { 1825 for (BinaryBasicBlock &BB : BFI.second) { 1826 auto LastInstRIter = BB.getLastNonPseudo(); 1827 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) || 1828 !BC.MIB->deleteREPPrefix(*LastInstRIter)) 1829 continue; 1830 1831 NumPrefixesRemoved += BB.getKnownExecutionCount(); 1832 ++NumBytesSaved; 1833 } 1834 } 1835 1836 if (NumBytesSaved) 1837 BC.outs() << "BOLT-INFO: removed " << NumBytesSaved 1838 << " 'repz' prefixes" 1839 " with estimated execution count of " 1840 << NumPrefixesRemoved << " times.\n"; 1841 return Error::success(); 1842 } 1843 1844 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { 1845 if (!BC.isX86()) 1846 return Error::success(); 1847 1848 uint64_t NumInlined = 0; 1849 uint64_t NumInlinedDyno = 0; 1850 for (auto &BFI : BC.getBinaryFunctions()) { 1851 for (BinaryBasicBlock &BB : BFI.second) { 1852 for (auto II = BB.begin(); II != BB.end(); ++II) { 1853 MCInst &Inst = *II; 1854 1855 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1856 !Inst.getOperand(0).isExpr()) 1857 continue; 1858 1859 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1860 if (CalleeSymbol->getName() != "memcpy" && 1861 CalleeSymbol->getName() != "memcpy@PLT" && 1862 CalleeSymbol->getName() != "_memcpy8") 1863 continue; 1864 1865 const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); 1866 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1867 1868 const InstructionListType NewCode = 1869 BC.MIB->createInlineMemcpy(IsMemcpy8); 1870 II = BB.replaceInstruction(II, NewCode); 1871 std::advance(II, NewCode.size() - 1); 1872 if (IsTailCall) { 1873 MCInst Return; 1874 BC.MIB->createReturn(Return); 1875 II = BB.insertInstruction(std::next(II), std::move(Return)); 1876 } 1877 1878 ++NumInlined; 1879 NumInlinedDyno += BB.getKnownExecutionCount(); 1880 } 1881 } 1882 } 1883 1884 if (NumInlined) { 1885 BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; 1886 if (NumInlinedDyno) 1887 BC.outs() << ". The calls were executed " << NumInlinedDyno 1888 << " times based on profile."; 1889 BC.outs() << '\n'; 1890 } 1891 return Error::success(); 1892 } 1893 1894 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { 1895 if (!BinaryFunctionPass::shouldOptimize(Function)) 1896 return false; 1897 1898 for (const std::string &FunctionSpec : Spec) { 1899 StringRef FunctionName = StringRef(FunctionSpec).split(':').first; 1900 if (Function.hasNameRegex(FunctionName)) 1901 return true; 1902 } 1903 1904 return false; 1905 } 1906 1907 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( 1908 const BinaryFunction &Function) const { 1909 StringRef SitesString; 1910 for (const std::string &FunctionSpec : Spec) { 1911 StringRef FunctionName; 1912 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':'); 1913 if (Function.hasNameRegex(FunctionName)) 1914 break; 1915 SitesString = ""; 1916 } 1917 1918 std::set<size_t> Sites; 1919 SmallVector<StringRef, 4> SitesVec; 1920 SitesString.split(SitesVec, ':'); 1921 for (StringRef SiteString : SitesVec) { 1922 if (SiteString.empty()) 1923 continue; 1924 size_t Result; 1925 if (!SiteString.getAsInteger(10, Result)) 1926 Sites.emplace(Result); 1927 } 1928 1929 return Sites; 1930 } 1931 1932 Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { 1933 if (!BC.isX86()) 1934 return Error::success(); 1935 1936 uint64_t NumSpecialized = 0; 1937 uint64_t NumSpecializedDyno = 0; 1938 for (auto &BFI : BC.getBinaryFunctions()) { 1939 BinaryFunction &Function = BFI.second; 1940 if (!shouldOptimize(Function)) 1941 continue; 1942 1943 std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function); 1944 auto shouldOptimize = [&](size_t N) { 1945 return CallsToOptimize.empty() || CallsToOptimize.count(N); 1946 }; 1947 1948 std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); 1949 size_t CallSiteID = 0; 1950 for (BinaryBasicBlock *CurBB : Blocks) { 1951 for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { 1952 MCInst &Inst = *II; 1953 1954 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1955 !Inst.getOperand(0).isExpr()) 1956 continue; 1957 1958 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1959 if (CalleeSymbol->getName() != "memcpy" && 1960 CalleeSymbol->getName() != "memcpy@PLT") 1961 continue; 1962 1963 if (BC.MIB->isTailCall(Inst)) 1964 continue; 1965 1966 ++CallSiteID; 1967 1968 if (!shouldOptimize(CallSiteID)) 1969 continue; 1970 1971 // Create a copy of a call to memcpy(dest, src, size). 1972 MCInst MemcpyInstr = Inst; 1973 1974 BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II); 1975 1976 BinaryBasicBlock *NextBB = nullptr; 1977 if (OneByteMemcpyBB->getNumNonPseudos() > 1) { 1978 NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin()); 1979 NextBB->eraseInstruction(NextBB->begin()); 1980 } else { 1981 NextBB = OneByteMemcpyBB->getSuccessor(); 1982 OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin()); 1983 assert(NextBB && "unexpected call to memcpy() with no return"); 1984 } 1985 1986 BinaryBasicBlock *MemcpyBB = Function.addBasicBlock(); 1987 MemcpyBB->setOffset(CurBB->getInputOffset()); 1988 InstructionListType CmpJCC = 1989 BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1, 1990 OneByteMemcpyBB->getLabel(), BC.Ctx.get()); 1991 CurBB->addInstructions(CmpJCC); 1992 CurBB->addSuccessor(MemcpyBB); 1993 1994 MemcpyBB->addInstruction(std::move(MemcpyInstr)); 1995 MemcpyBB->addSuccessor(NextBB); 1996 MemcpyBB->setCFIState(NextBB->getCFIState()); 1997 MemcpyBB->setExecutionCount(0); 1998 1999 // To prevent the actual call from being moved to cold, we set its 2000 // execution count to 1. 2001 if (CurBB->getKnownExecutionCount() > 0) 2002 MemcpyBB->setExecutionCount(1); 2003 2004 InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy(); 2005 OneByteMemcpyBB->addInstructions(OneByteMemcpy); 2006 2007 ++NumSpecialized; 2008 NumSpecializedDyno += CurBB->getKnownExecutionCount(); 2009 2010 CurBB = NextBB; 2011 2012 // Note: we don't expect the next instruction to be a call to memcpy. 2013 II = CurBB->begin(); 2014 } 2015 } 2016 } 2017 2018 if (NumSpecialized) { 2019 BC.outs() << "BOLT-INFO: specialized " << NumSpecialized 2020 << " memcpy() call sites for size 1"; 2021 if (NumSpecializedDyno) 2022 BC.outs() << ". The calls were executed " << NumSpecializedDyno 2023 << " times based on profile."; 2024 BC.outs() << '\n'; 2025 } 2026 return Error::success(); 2027 } 2028 2029 void RemoveNops::runOnFunction(BinaryFunction &BF) { 2030 const BinaryContext &BC = BF.getBinaryContext(); 2031 for (BinaryBasicBlock &BB : BF) { 2032 for (int64_t I = BB.size() - 1; I >= 0; --I) { 2033 MCInst &Inst = BB.getInstructionAtIndex(I); 2034 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP")) 2035 BB.eraseInstructionAtIndex(I); 2036 } 2037 } 2038 } 2039 2040 Error RemoveNops::runOnFunctions(BinaryContext &BC) { 2041 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 2042 runOnFunction(BF); 2043 }; 2044 2045 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 2046 return BF.shouldPreserveNops(); 2047 }; 2048 2049 ParallelUtilities::runOnEachFunction( 2050 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 2051 SkipFunc, "RemoveNops"); 2052 return Error::success(); 2053 } 2054 2055 } // namespace bolt 2056 } // namespace llvm 2057