//===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements multiple passes for binary optimization and analysis.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <mutex>
#include <numeric>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

// Return a command-line option name for a dyno-stats category by taking the
// category description and replacing spaces with dashes. The returned pointer
// refers into a function-local static array, so it stays valid for the
// lifetime of the program (required by the cl::values() table below).
static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
  assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
         C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");

  // One slot per category; the slot is (re)written on every call.
  static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];

  OptNames[C] = bolt::DynoStats::Description(C);

  std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');

  return OptNames[C].c_str();
}

namespace opts {

extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));

// Return true if \p Function originates from one of the sections listed in
// -hot-text-move-sections.
bool isHotTextMover(const BinaryFunction &Function) {
  for (std::string &SectionName : opts::HotTextMoveSections) {
    if (Function.getOriginSectionName() &&
        *Function.getOriginSectionName() == SectionName)
      return true;
  }

  return false;
}

static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(0), cl::cat(BoltOptCategory));

static cl::opt<bool> PrintLargeFunctions(
    "print-large-functions",
    cl::desc("print functions that could not be overwritten due to excessive "
             "size"),
    cl::init(false), cl::cat(BoltOptCategory));

static cl::list<bolt::DynoStats::Category>
    PrintSortedBy("print-sorted-by", cl::CommaSeparated,
                  cl::desc("print functions sorted by order of dyno stats"),
                  cl::value_desc("key1,key2,key3,..."),
                  cl::values(
// Expand REAL_DYNO_STATS into one clEnumValN entry per category, using
// dynoStatsOptName() to derive a dash-separated option name.
#define D(name, description, ...)                                              \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
             description),
                      REAL_DYNO_STATS
#undef D
                      clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                                 "sorted by all names")),
                  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
    PrintUnknown("print-unknown",
                 cl::desc("print names of functions with unknown control flow"),
                 cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
    PrintUnknownCFG("print-unknown-cfg",
                    cl::desc("dump CFG of functions with unknown control flow"),
                    cl::cat(BoltCategory), cl::ReallyHidden);

// Please MSVC19 with a forward declaration: otherwise it reports an error about
// an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    // Map the deprecated "cache+" spelling onto "ext-tsp" with a warning.
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
    clEnumValN(SctcPreserveDirection,
      "preserve",
      "only perform sctc when branch direction is "
      "preserved"),
    clEnumValN(SctcHeuristic,
      "heuristic",
      "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
StaleThreshold("stale-threshold",
    cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
    cl::init(100),
    cl::Hidden,
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

} // namespace opts

namespace llvm {
namespace bolt {

// Default pass gate: optimize only simple functions that have a CFG built and
// are not ignored.
bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
  return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
         !BF.isIgnored();
}

// Default print gate: print simple, non-ignored functions.
bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
  return BF.isSimple() && !BF.isIgnored();
}

// Remove empty basic blocks (redirecting their predecessors to the single
// successor) and count duplicate CFG edges, fixing branches afterwards if
// anything changed.
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  for (BinaryBasicBlock &BB : BF) {
    // Only empty blocks are candidates for removal.
    if (!BB.empty())
      continue;

    // Entry points and landing pads must be preserved even when empty.
    if
(BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      // Jump-table targets cannot be redirected here; leave the edge alone.
      if (Predecessor->hasJumpTable())
        break;

      // Avoid creating a self-loop on the successor.
      if (Predecessor == Successor)
        break;

      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    // Only mark the block for removal once every predecessor was redirected.
    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}

// Run CFG normalization over all eligible functions in parallel, then report
// aggregate statistics.
Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { runOnFunction(BF); },
      [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
      "NormalizeCFG");
  if (NumBlocksRemoved)
    BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
              << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
  if (NumDuplicateEdgesMerged)
    BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
              << " duplicate CFG edge"
              << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
  return Error::success();
}

// Unreachable-code elimination for a single function: mark unreachable blocks
// and erase them, tracking the number of blocks and bytes deleted.
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  BinaryContext &BC = Function.getBinaryContext();
  unsigned Count;
  uint64_t Bytes;
  Function.markUnreachableBlocks();
  LLVM_DEBUG({
    for (BinaryBasicBlock &BB : Function) {
      if (!BB.isValid()) {
        dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
               << " in function " << Function << "\n";
        Function.dump();
      }
    }
  });
  // Use a private code emitter so block sizes can be measured when this
  // function runs in parallel with others.
  BinaryContext::IndependentCodeEmitter Emitter =
      BC.createIndependentMCCodeEmitter();
  std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
  DeletedBlocks += Count;
  DeletedBytes += Bytes;
  if (Count) {
    // Lock while mutating the shared Modified set and writing output.
    auto L = BC.scopeLock();
    Modified.insert(&Function);
    if (opts::Verbosity > 0)
      BC.outs() << "BOLT-INFO: removed " << Count
                << " dead basic block(s) accounting for " << Bytes
                << " bytes in function " << Function << '\n';
  }
}

Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    runOnFunction(BF);
  };
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF) || BF.getLayout().block_empty();
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "elimininate-unreachable");

  if (DeletedBlocks)
    BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
              << DeletedBytes << " bytes of code\n";
  return Error::success();
}

// Only print functions when block reordering is actually enabled.
bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
  return (BinaryFunctionPass::shouldPrint(BF) &&
          opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
}

bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

// Reorder basic blocks of all eligible functions in parallel according to the
// -reorder-blocks strategy, then report how many layouts changed and,
// optionally (-print-function-statistics), per-function details.
Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return Error::success();

  // Shared across worker threads: counter is atomic, the edit-distance map is
  // guarded by its own mutex.
  std::atomic_uint64_t ModifiedFuncCount(0);
  std::mutex FunctionEditDistanceMutex;
  DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;

  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    // Snapshot the old order only if we will need the edit distance later.
    SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
    if (opts::PrintFuncStat > 0)
      llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));

    const bool LayoutChanged =
        modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
    if (LayoutChanged) {
      ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
      if (opts::PrintFuncStat > 0) {
        const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
        std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
        FunctionEditDistance[&BF] = Distance;
      }
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "ReorderBasicBlocks");
  const size_t NumAllProfiledFunctions =
      BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;

  BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
            << format(
                   "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
                   ModifiedFuncCount.load(std::memory_order_relaxed),
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       NumAllProfiledFunctions,
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       BC.getBinaryFunctions().size());

  if (opts::PrintFuncStat > 0) {
    raw_ostream &OS = BC.outs();
    // Copy all the values into vector in order to sort them
    std::map<uint64_t, BinaryFunction &> ScoreMap;
    auto &BFs = BC.getBinaryFunctions();
    for (auto It = BFs.begin(); It != BFs.end(); ++It)
      ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
          It->second.getFunctionScore(), It->second));

    OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
    OS << " There are " << BFs.size() << " functions in total. \n";
    OS << " Number of functions being modified: "
       << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
    OS << " User asks for detailed information on top "
       << opts::PrintFuncStat << " functions. (Ranked by function score)"
       << "\n\n";
    // Walk the score map in descending order and print the top N functions.
    uint64_t I = 0;
    for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
             ScoreMap.rbegin();
         Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
      BinaryFunction &Function = Rit->second;

      OS << " Information for function of top: " << (I + 1) << ": \n";
      OS << " Function Score is: " << Function.getFunctionScore()
         << "\n";
      OS << " There are " << Function.size()
         << " number of blocks in this function.\n";
      OS << " There are " << Function.getInstructionCount()
         << " number of instructions in this function.\n";
      OS << " The edit distance for this function is: "
         << FunctionEditDistance.lookup(&Function) << "\n\n";
    }
  }
  return Error::success();
}

// Compute and apply a new basic-block layout for \p BF using the algorithm
// selected by \p Type. Returns true if the layout actually changed.
bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  if (BF.size() == 0 || Type == LT_NONE)
    return false;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
if (Type != LT_REVERSE && !BF.hasValidProfile())
    return false;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    // Heuristic algorithms operate on clusters of blocks; pick the clustering
    // strategy first (-min-branch-clusters selects the branch-minimizing one).
    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    default:
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  return BF.getLayout().update(NewLayout);
}

// Re-synchronize branch instructions with the CFG for every function that will
// be emitted.
Error FixupBranches::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (!BC.shouldEmit(Function) || !Function.isSimple())
      continue;

    Function.fixBranches();
  }
  return Error::success();
}

// Finalize CFI state and exception ranges for all functions to be emitted.
// A CFI fixup failure is fatal in relocation mode; otherwise the function is
// demoted to non-simple and left in its original form.
Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  std::atomic<bool> HasFatal{false};
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      if (BC.HasRelocations) {
        BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
                  << ". Exiting.\n";
        HasFatal = true;
        return;
      }
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
  if (HasFatal)
    return createFatalBOLTError("finalize CFI state failure");
  return Error::success();
}

// In non-relocation mode, detect functions whose emitted size exceeds the
// space available at their original location and mark them non-simple.
Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
  // In relocation mode functions can be moved, so size is not a constraint.
  if (BC.HasRelocations)
    return Error::success();

  // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
  // incorrect meta data.
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    uint64_t HotSize, ColdSize;
    std::tie(HotSize, ColdSize) =
        BC.calculateEmittedSize(BF, /*FixBranches=*/false);
    if (HotSize > BF.getMaxSize()) {
      if (opts::PrintLargeFunctions)
        BC.outs() << "BOLT-INFO: " << BF << " size exceeds allocated space by "
                  << (HotSize - BF.getMaxSize()) << " bytes\n";
      BF.setSimple(false);
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "CheckLargeFunctions");

  return Error::success();
}

bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Unlike other passes, allow functions in non-CFG state.
  return BF.isSimple() && !BF.isIgnored();
}

Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  // Convert GnuArgsSize annotations into CFIs.
  for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
    for (FunctionFragment &FF : BF->getLayout().fragments()) {
      // Reset at the start of the new fragment.
      int64_t CurrentGnuArgsSize = 0;

      for (BinaryBasicBlock *const BB : FF) {
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
            continue;

          const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
          assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
          // Only emit a CFI when the value actually changes.
          if (NewGnuArgsSize == CurrentGnuArgsSize)
            continue;

          auto InsertII = BF->addCFIInstruction(
              BB, II,
              MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
          CurrentGnuArgsSize = NewGnuArgsSize;
          // Continue iterating after the newly inserted CFI instruction.
          II = std::next(InsertII);
        }
      }
    }
  }
  return Error::success();
}

// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
Error CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.getValue().Symbol;
    if (!S)
      continue;
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
  return Error::success();
}

// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
// B0: ...
//     jmp B1 (or jcc B1)
//
// B1: jmp B2
//
// ->
//
// B0: ...
//     jmp B2 (or jcc B2)
//
// Returns the number of double jumps removed. When \p MarkInvalid is set,
// blocks left without predecessors are marked invalid so a later cleanup can
// erase them.
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Try to redirect \p Pred from BB straight to BB's target. \p Succ is the
    // in-function target block, or null for the tail-call case, where
    // \p SuccSym names the callee. Returns true on success.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym,
                             std::optional<uint32_t> Offset) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
          if (Offset) {
            // Carry the original offset over to the new tail call (used by
            // BAT).
            MCInst *TailCall = Pred->getLastNonPseudoInstr();
            assert(TailCall);
            MIB->setOffset(*TailCall, *Offset);
          }
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // Only consider blocks that consist of a single real instruction and are
    // not landing pads.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Copy the predecessor list: checkAndPatch mutates BB's predecessors.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
            MarkInvalid)
          // Invalidate BB once it has no remaining reason to stay alive.
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}

// Decide whether a conditional branch in \p PredBB targeting \p BB should be
// rewritten into a conditional tail call, honoring the -sctc-mode policy.
// \p DirectionFlag tells whether the resulting tail call would be forward.
bool SimplifyConditionalTailCalls::shouldRewriteBranch(
    const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
    const BinaryBasicBlock *BB, const bool DirectionFlag) {
  // Each predecessor block is rewritten at most once.
  if (BeenOptimized.count(PredBB))
    return false;

  const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);

  if (IsForward)
    ++NumOrigForwardBranches;
  else
    ++NumOrigBackwardBranches;

  if (opts::SctcMode == opts::SctcAlways)
    return true;

  if (opts::SctcMode == opts::SctcPreserveDirection)
    return IsForward == DirectionFlag;

  // Heuristic mode: consult branch statistics.
  const ErrorOr<std::pair<double, double>> Frequency =
      PredBB->getBranchStats(BB);

  // It's ok to rewrite the conditional branch if the new target will be
  // a backward branch.

  // If no data available for these branches, then it should be ok to
  // do the optimization since it will reduce code size.
  if (Frequency.getError())
    return true;

  // TODO: should this use misprediction frequency instead?
const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
                      (!IsForward && Frequency.get().first <= 0.5);

  return Result == DirectionFlag;
}

// Convert "jcc over an unconditional tail call" patterns in \p BF into
// conditional tail calls (CTCs). Returns non-zero (number of CTCs created,
// used as a boolean by the caller) when the function was modified.
uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch' direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  // Predecessors whose condition was reversed and which still need an
  // unconditional branch to the old fall-through target (pass 2 below).
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // Skip dynamic branches for now.
      if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch))
        continue;

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid. We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add a unconditional
        // branch to the old target. This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(PredBB, CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc
      MIB->setConditionalTailCall(*CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;
      // Preserve Offset annotation, used in BAT.
      // Instr is a direct tail call instruction that was created when CTCs are
      // first expanded, and has the original CTC offset set.
      if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr))
        MIB->setOffset(*CondBranch, *Offset);

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(BB);
      // Update BB execution count
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block. Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(PredBB, false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    if (UncondBranch) {
      // Existing unconditional branch: drop it if the target now falls
      // through, otherwise retarget it.
      if (HasFallthrough)
        PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
      else
        MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      // No branch present and no fallthrough: synthesize one.
      MCInst Branch;
      MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(BF, true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n";);

  // Fold per-function numbers into the pass-wide counters.
  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}

// Run SCTC over all eligible functions (X86 only) and report totals.
Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;

    if (!shouldOptimize(Function))
      continue;

    if (fixTailCalls(Function)) {
      Modified.insert(&Function);
      // SCTC leaves the CFG in a non-canonical state.
      Function.setHasCanonicalCFG(false);
    }
  }

  if (NumTailCallsPatched)
    BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
              << " tail calls (" << NumOrigForwardBranches << " forward)"
              << " tail calls (" << NumOrigBackwardBranches << " backward)"
              << " from a total of " << NumCandidateTailCalls
              << " while removing " << NumDoubleJumps << " double jumps"
              << " and removing " << DeletedBlocks << " basic blocks"
              << " totalling " << DeletedBytes
              << " bytes of code. CTCs total execution count is "
              << CTCExecCount << " and the number of times CTCs are taken is "
              << CTCTakenCount << "\n";
  return Error::success();
}

uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
  uint64_t Count = 0;
  const BinaryContext &BC = Function.getBinaryContext();
  for (BinaryBasicBlock &BB : Function) {
    for (MCInst &Inst : BB) {
      // Skip shortening instructions with Size annotation.
      if (BC.MIB->getSize(Inst))
        continue;

      // Keep a copy only when we may need to print the before/after pair.
      MCInst OriginalInst;
      if (opts::Verbosity > 2)
        OriginalInst = Inst;

      if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
        continue;

      if (opts::Verbosity > 2) {
        // Serialize output — this runs from multiple worker threads.
        BC.scopeLock();
        BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
        BC.printInstruction(BC.outs(), OriginalInst, 0, &Function);
        BC.outs() << "BOLT-INFO: to:";
        BC.printInstruction(BC.outs(), Inst, 0, &Function);
      }

      ++Count;
    }
  }

  return Count;
}

/// Run instruction shortening in parallel over all functions (x86 only).
Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
  std::atomic<uint64_t> NumShortened{0};
  if (!BC.isX86())
    return Error::success();

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
      [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
      nullptr, "ShortenInstructions");

  if (NumShortened)
    BC.outs() << "BOLT-INFO: " << NumShortened
              << " instructions were shortened\n";
  return Error::success();
}

/// Append a trap instruction after every indirect tail call in \p Function.
void Peepholes::addTailcallTraps(BinaryFunction &Function) {
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    MCInst *Inst = BB.getLastNonPseudoInstr();
    if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
      MCInst Trap;
      MIB->createTrap(Trap);
      BB.addInstruction(Trap);
      ++TailCallTraps;
    }
  }
}

/// Remove conditional branches whose taken and fall-through successors are
/// the same basic block.
void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
  for (BinaryBasicBlock &BB : Function) {
    if (BB.succ_size() != 2)
      continue;

    BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
    BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
    // Only interesting when both edges lead to the same block.
    if (CondBB != UncondBB)
      continue;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst
        *UncondBranch = nullptr;
    bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // analyzeBranch() can fail due to unusual branch instructions,
    // e.g. jrcxz, or jump tables (indirect jump).
    if (!Result || !CondBranch)
      continue;

    BB.removeDuplicateConditionalSuccessor(CondBranch);
    ++NumUselessCondBranches;
  }
}

/// Dispatch the peephole optimizations selected via -peepholes.
Error Peepholes::runOnFunctions(BinaryContext &BC) {
  // Fold all requested peephole options into a single bitmask.
  const char Opts =
      std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
                      [](const char A, const PeepholeOpts B) { return A | B; });
  if (Opts == PEEP_NONE)
    return Error::success();

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function)) {
      if (Opts & PEEP_DOUBLE_JUMPS)
        NumDoubleJumps += fixDoubleJumps(Function, false);
      if (Opts & PEEP_TAILCALL_TRAPS)
        addTailcallTraps(Function);
      if (Opts & PEEP_USELESS_BRANCHES)
        removeUselessCondBranches(Function);
      assert(Function.validateCFG());
    }
  }
  BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
            << " double jumps patched.\n"
            << "BOLT-INFO: Peephole: " << TailCallTraps
            << " tail call traps inserted.\n"
            << "BOLT-INFO: Peephole: " << NumUselessCondBranches
            << " useless conditional branches removed.\n";
  return Error::success();
}

/// Replace loads whose target address can be computed statically and lies in
/// a read-only section with immediate operands. Returns true if any load in
/// \p BF was simplified.
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load
      // from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address;
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      // Skip addresses that relocations will rewrite at link/load time.
      if (BC.getRelocationAt(TargetAddress) ||
          BC.getDynamicRelocationAt(TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  // Fold per-function counters into the pass-wide statistics.
  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}

Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function) && simplifyRODataLoads(Function))
      Modified.insert(&Function);
  }

  BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
            << NumLoadsFound << " loads from a statically computed address.\n"
            << "BOLT-INFO: dynamic loads simplified: "
            << NumDynamicLoadsSimplified << "\n"
            << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
            << "\n";
  return Error::success();
}

/// Assign output code section names: injected functions, hot-text movers,
/// and (in relocation mode) main vs. cold sections.
Error AssignSections::runOnFunctions(BinaryContext &BC) {
  for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
    Function->setCodeSectionName(BC.getInjectedCodeSectionName());
    Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
  }

  // In non-relocation mode functions have pre-assigned
  // section names.
  if (!BC.HasRelocations)
    return Error::success();

  // Use a cold section only when there is profile data or the user supplied
  // an explicit function order.
  const bool UseColdSection =
      BC.NumProfiledFuncs > 0 ||
      opts::ReorderFunctions == ReorderFunctions::RT_USER;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (opts::isHotTextMover(Function)) {
      Function.setCodeSectionName(BC.getHotTextMoverSectionName());
      Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
      continue;
    }

    if (!UseColdSection || Function.hasValidIndex())
      Function.setCodeSectionName(BC.getMainCodeSectionName());
    else
      Function.setCodeSectionName(BC.getColdCodeSectionName());

    if (Function.isSplit())
      Function.setColdCodeSectionName(BC.getColdCodeSectionName());
  }
  return Error::success();
}

/// Report a profile "bias" score: mean and standard deviation of the relative
/// difference between outgoing and incoming edge counts over qualifying basic
/// blocks of all simple functions.
Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        // Skip edges without a meaningful count.
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    double FuncMean = Mean / NumBlocks;
    // Track the function with the worst per-function average bias.
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report to user
  BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                      (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    BC.outs() << "Worst average bias observed in "
              << WorstBiasFunc->getPrintName() << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
  return Error::success();
}

/// Print summary statistics about functions in the binary and the state of
/// the profile attached to them (valid, inferred, or stale).
Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumRegularFunctions = 0;
  uint64_t NumStaleProfileFunctions = 0;
  uint64_t NumAllStaleFunctions = 0;
  uint64_t NumInferredFunctions = 0;
  uint64_t NumNonSimpleProfiledFunctions = 0;
  uint64_t NumUnknownControlFlowFunctions = 0;
  uint64_t TotalSampleCount = 0;
  uint64_t StaleSampleCount = 0;
  uint64_t InferredSampleCount = 0;
  std::vector<const BinaryFunction *> ProfiledFunctions;
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;

    // Ignore PLT functions for stats.
    if (Function.isPLTFunction())
      continue;

    // Adjustment for BAT mode: the profile for BOLT split fragments is combined
    // so only count the hot fragment.
1397 const uint64_t Address = Function.getAddress(); 1398 bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() && 1399 BAT && BAT->isBATFunction(Address) && 1400 !BAT->fetchParentAddress(Address); 1401 1402 ++NumRegularFunctions; 1403 1404 // In BOLTed binaries split functions are non-simple (due to non-relocation 1405 // mode), but the original function is known to be simple and we have a 1406 // valid profile for it. 1407 if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) { 1408 if (Function.hasProfile()) 1409 ++NumNonSimpleProfiledFunctions; 1410 continue; 1411 } 1412 1413 if (Function.hasUnknownControlFlow()) { 1414 if (opts::PrintUnknownCFG) 1415 Function.dump(); 1416 else if (opts::PrintUnknown) 1417 BC.errs() << "function with unknown control flow: " << Function << '\n'; 1418 1419 ++NumUnknownControlFlowFunctions; 1420 } 1421 1422 if (!Function.hasProfile()) 1423 continue; 1424 1425 uint64_t SampleCount = Function.getRawBranchCount(); 1426 TotalSampleCount += SampleCount; 1427 1428 if (Function.hasValidProfile()) { 1429 ProfiledFunctions.push_back(&Function); 1430 if (Function.hasInferredProfile()) { 1431 ++NumInferredFunctions; 1432 InferredSampleCount += SampleCount; 1433 ++NumAllStaleFunctions; 1434 } 1435 } else { 1436 if (opts::ReportStaleFuncs) { 1437 BC.outs() << StaleFuncsHeader; 1438 StaleFuncsHeader = ""; 1439 BC.outs() << " " << Function << '\n'; 1440 } 1441 ++NumStaleProfileFunctions; 1442 StaleSampleCount += SampleCount; 1443 ++NumAllStaleFunctions; 1444 } 1445 } 1446 BC.NumProfiledFuncs = ProfiledFunctions.size(); 1447 BC.NumStaleProfileFuncs = NumStaleProfileFunctions; 1448 1449 const size_t NumAllProfiledFunctions = 1450 ProfiledFunctions.size() + NumStaleProfileFunctions; 1451 BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " 1452 << NumRegularFunctions << " functions in the binary (" 1453 << format("%.1f", NumAllProfiledFunctions / 1454 (float)NumRegularFunctions * 100.0f) 1455 << "%) have 
non-empty execution profile\n"; 1456 if (NumNonSimpleProfiledFunctions) { 1457 BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" 1458 << (NumNonSimpleProfiledFunctions == 1 ? "" : "s") 1459 << " with profile could not be optimized\n"; 1460 } 1461 if (NumAllStaleFunctions) { 1462 const float PctStale = 1463 NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f; 1464 const float PctStaleFuncsWithEqualBlockCount = 1465 (float)BC.Stats.NumStaleFuncsWithEqualBlockCount / 1466 NumAllStaleFunctions * 100.0f; 1467 const float PctStaleBlocksWithEqualIcount = 1468 (float)BC.Stats.NumStaleBlocksWithEqualIcount / 1469 BC.Stats.NumStaleBlocks * 100.0f; 1470 auto printErrorOrWarning = [&]() { 1471 if (PctStale > opts::StaleThreshold) 1472 BC.errs() << "BOLT-ERROR: "; 1473 else 1474 BC.errs() << "BOLT-WARNING: "; 1475 }; 1476 printErrorOrWarning(); 1477 BC.errs() << NumAllStaleFunctions 1478 << format(" (%.1f%% of all profiled)", PctStale) << " function" 1479 << (NumAllStaleFunctions == 1 ? "" : "s") 1480 << " have invalid (possibly stale) profile." 1481 " Use -report-stale to see the list.\n"; 1482 if (TotalSampleCount > 0) { 1483 printErrorOrWarning(); 1484 BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of " 1485 << TotalSampleCount << " samples in the binary (" 1486 << format("%.1f", 1487 ((100.0f * (StaleSampleCount + InferredSampleCount)) / 1488 TotalSampleCount)) 1489 << "%) belong to functions with invalid" 1490 " (possibly stale) profile.\n"; 1491 } 1492 BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount 1493 << " stale function" 1494 << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s") 1495 << format(" (%.1f%% of all stale)", 1496 PctStaleFuncsWithEqualBlockCount) 1497 << " have matching block count.\n"; 1498 BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount 1499 << " stale block" 1500 << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? 
"" : "s") 1501 << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount) 1502 << " have matching icount.\n"; 1503 if (PctStale > opts::StaleThreshold) { 1504 return createFatalBOLTError( 1505 Twine("BOLT-ERROR: stale functions exceed specified threshold of ") + 1506 Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n")); 1507 } 1508 } 1509 if (NumInferredFunctions) { 1510 BC.outs() << format( 1511 "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, " 1512 "%.2f%% of stale) functions responsible for %.2f%% samples" 1513 " (%zu out of %zu)\n", 1514 NumInferredFunctions, 1515 100.0 * NumInferredFunctions / NumAllProfiledFunctions, 1516 100.0 * NumInferredFunctions / NumAllStaleFunctions, 1517 100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount, 1518 TotalSampleCount); 1519 BC.outs() << format( 1520 "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" 1521 " (%zu out of %zu stale) responsible for %.2f%% samples" 1522 " (%zu out of %zu stale)\n", 1523 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, 1524 BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, 1525 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, 1526 BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); 1527 } 1528 1529 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1530 BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1531 << " objects was ignored\n"; 1532 } 1533 1534 if (ProfiledFunctions.size() > 10) { 1535 if (opts::Verbosity >= 1) { 1536 BC.outs() << "BOLT-INFO: top called functions are:\n"; 1537 llvm::sort(ProfiledFunctions, 1538 [](const BinaryFunction *A, const BinaryFunction *B) { 1539 return B->getExecutionCount() < A->getExecutionCount(); 1540 }); 1541 auto SFI = ProfiledFunctions.begin(); 1542 auto SFIend = ProfiledFunctions.end(); 1543 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1544 ++SFI, ++I) 1545 BC.outs() << " " << **SFI << " : " << 
(*SFI)->getExecutionCount() 1546 << '\n'; 1547 } 1548 } 1549 1550 if (!opts::PrintSortedBy.empty()) { 1551 std::vector<BinaryFunction *> Functions; 1552 std::map<const BinaryFunction *, DynoStats> Stats; 1553 1554 for (auto &BFI : BC.getBinaryFunctions()) { 1555 BinaryFunction &BF = BFI.second; 1556 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1557 Functions.push_back(&BF); 1558 Stats.emplace(&BF, getDynoStats(BF)); 1559 } 1560 } 1561 1562 const bool SortAll = 1563 llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT); 1564 1565 const bool Ascending = 1566 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1567 1568 std::function<bool(const DynoStats &, const DynoStats &)> 1569 DynoStatsComparator = 1570 SortAll ? [](const DynoStats &StatsA, 1571 const DynoStats &StatsB) { return StatsA < StatsB; } 1572 : [](const DynoStats &StatsA, const DynoStats &StatsB) { 1573 return StatsA.lessThan(StatsB, opts::PrintSortedBy); 1574 }; 1575 1576 llvm::stable_sort(Functions, 1577 [Ascending, &Stats, DynoStatsComparator]( 1578 const BinaryFunction *A, const BinaryFunction *B) { 1579 auto StatsItr = Stats.find(A); 1580 assert(StatsItr != Stats.end()); 1581 const DynoStats &StatsA = StatsItr->second; 1582 1583 StatsItr = Stats.find(B); 1584 assert(StatsItr != Stats.end()); 1585 const DynoStats &StatsB = StatsItr->second; 1586 1587 return Ascending ? 
DynoStatsComparator(StatsA, StatsB) 1588 : DynoStatsComparator(StatsB, StatsA); 1589 }); 1590 1591 BC.outs() << "BOLT-INFO: top functions sorted by "; 1592 if (SortAll) { 1593 BC.outs() << "dyno stats"; 1594 } else { 1595 BC.outs() << "("; 1596 bool PrintComma = false; 1597 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1598 if (PrintComma) 1599 BC.outs() << ", "; 1600 BC.outs() << DynoStats::Description(Category); 1601 PrintComma = true; 1602 } 1603 BC.outs() << ")"; 1604 } 1605 1606 BC.outs() << " are:\n"; 1607 auto SFI = Functions.begin(); 1608 for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { 1609 const DynoStats Stats = getDynoStats(**SFI); 1610 BC.outs() << " " << **SFI; 1611 if (!SortAll) { 1612 BC.outs() << " ("; 1613 bool PrintComma = false; 1614 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1615 if (PrintComma) 1616 BC.outs() << ", "; 1617 BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; 1618 PrintComma = true; 1619 } 1620 BC.outs() << ")"; 1621 } 1622 BC.outs() << "\n"; 1623 } 1624 } 1625 1626 if (!BC.TrappedFunctions.empty()) { 1627 BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" 1628 << (BC.TrappedFunctions.size() > 1 ? "s" : "") 1629 << " will trap on entry. Use -trap-avx512=0 to disable" 1630 " traps."; 1631 if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { 1632 BC.errs() << '\n'; 1633 for (const BinaryFunction *Function : BC.TrappedFunctions) 1634 BC.errs() << " " << *Function << '\n'; 1635 } else { 1636 BC.errs() << " Use -v=1 to see the list.\n"; 1637 } 1638 } 1639 1640 // Print information on missed macro-fusion opportunities seen on input. 
  if (BC.Stats.MissedMacroFusionPairs) {
    BC.outs() << format(
        "BOLT-INFO: the input contains %zu (dynamic count : %zu)"
        " opportunities for macro-fusion optimization",
        BC.Stats.MissedMacroFusionPairs, BC.Stats.MissedMacroFusionExecCount);
    switch (opts::AlignMacroOpFusion) {
    case MFT_NONE:
      BC.outs() << ". Use -align-macro-fusion to fix.\n";
      break;
    case MFT_HOT:
      BC.outs() << ". Will fix instances on a hot path.\n";
      break;
    case MFT_ALL:
      BC.outs() << " that are going to be fixed\n";
      break;
    }
  }

  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      // Walk the layout in reverse: a zero-count block following (in layout
      // order) a hot block indicates cold code in the middle of hot code.
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(&BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      llvm::sort(SuboptimalFuncs,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });

      BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
                << " functions have "
                   "cold code in the middle of hot code. Top functions are:\n";
      for (unsigned I = 0;
           I < std::min(static_cast<size_t>(opts::ReportBadLayout),
                        SuboptimalFuncs.size());
           ++I)
        SuboptimalFuncs[I]->print(BC.outs());
    }
  }

  if (NumUnknownControlFlowFunctions) {
    BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
              << " functions have instructions with unknown control flow";
    if (!opts::PrintUnknown)
      BC.outs() << ". Use -print-unknown to see the list.";
    BC.outs() << '\n';
  }
  return Error::success();
}

/// Invoke MIB->lowerTailCall() on every instruction in the binary.
Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
  for (auto &BFI : BC.getBinaryFunctions())
    for (BinaryBasicBlock &BB : BFI.second)
      for (MCInst &Instruction : BB)
        BC.MIB->lowerTailCall(Instruction);
  return Error::success();
}

/// Remove "repz" prefixes from return instructions (x86 only), reporting the
/// bytes saved and an execution-count estimate of removed prefixes.
Error StripRepRet::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  uint64_t NumPrefixesRemoved = 0;
  uint64_t NumBytesSaved = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      auto LastInstRIter = BB.getLastNonPseudo();
      if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
          !BC.MIB->deleteREPPrefix(*LastInstRIter))
        continue;

      NumPrefixesRemoved += BB.getKnownExecutionCount();
      ++NumBytesSaved;
    }
  }

  if (NumBytesSaved)
    BC.outs() << "BOLT-INFO: removed " << NumBytesSaved
              << " 'repz' prefixes"
                 " with estimated execution count of "
              << NumPrefixesRemoved << " times.\n";
  return Error::success();
}

/// Replace calls to memcpy()/memcpy@PLT/_memcpy8 with an inline expansion
/// (x86 only).
Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  uint64_t NumInlined = 0;
  uint64_t NumInlinedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        // Only direct calls with a single symbolic operand qualify.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8")
          continue;

        // _memcpy8 returns dest+size rather than dest.
        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(IsMemcpy8);
        II = BB.replaceInstruction(II, NewCode);
        std::advance(II, NewCode.size() - 1);
        if (IsTailCall) {
          // A tail call transferred control out; re-materialize the return.
          MCInst Return;
          BC.MIB->createReturn(Return);
          II = BB.insertInstruction(std::next(II), std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
    if (NumInlinedDyno)
      BC.outs() << ". The calls were executed " << NumInlinedDyno
                << " times based on profile.";
    BC.outs() << '\n';
  }
  return Error::success();
}

/// A function qualifies if any "name:sites" entry in Spec matches its name.
bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
  if (!BinaryFunctionPass::shouldOptimize(Function))
    return false;

  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
    if (Function.hasNameRegex(FunctionName))
      return true;
  }

  return false;
}

/// Parse the colon-separated call-site list from the Spec entry matching
/// \p Function. An empty result means "optimize every call site".
std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
    const BinaryFunction &Function) const {
  StringRef SitesString;
  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName;
    std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
    if (Function.hasNameRegex(FunctionName))
      break;
    SitesString = "";
  }

  std::set<size_t> Sites;
  SmallVector<StringRef, 4> SitesVec;
  SitesString.split(SitesVec, ':');
  for (StringRef SiteString : SitesVec) {
    if (SiteString.empty())
      continue;
    size_t Result;
    // getAsInteger() returns true on failure; ignore malformed entries.
    if (!SiteString.getAsInteger(10, Result))
      Sites.emplace(Result);
  }

  return Sites;
}

/// Specialize selected memcpy() call sites for size 1 by branching to a
/// one-byte copy when the size argument equals 1 (x86 only).
Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    // Snapshot the block list: the loop below adds/splits blocks.
    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II =
               CurBB->begin();
           II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        // Only direct calls with a single symbolic operand qualify.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        if (BC.MIB->isTailCall(Inst))
          continue;

        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        // Split so the call starts its own block, which becomes the one-byte
        // specialization path.
        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          // Split off the instructions following the call into NextBB and
          // drop the original call.
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          // The call was the only instruction; its successor continues.
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        // MemcpyBB holds the original (general-size) call to memcpy().
        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        // if (size == 1) goto OneByteMemcpyBB;  — size is the 3rd int arg.
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
                                OneByteMemcpyBB->getLabel(), BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        // Emit the specialized one-byte copy on the fast path.
        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Continue scanning from the block following the rewritten call.
        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
              << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      BC.outs() << ". The calls were executed " << NumSpecializedDyno
                << " times based on profile.";
    BC.outs() << '\n';
  }
  return Error::success();
}

/// Erase no-op instructions carrying a "NOP" annotation from \p BF.
void RemoveNops::runOnFunction(BinaryFunction &BF) {
  const BinaryContext &BC = BF.getBinaryContext();
  for (BinaryBasicBlock &BB : BF) {
    // Iterate backwards so erasing does not shift the unvisited indices.
    for (int64_t I = BB.size() - 1; I >= 0; --I) {
      MCInst &Inst = BB.getInstructionAtIndex(I);
      if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
        BB.eraseInstructionAtIndex(I);
    }
  }
}

/// Run nop removal in parallel, skipping functions that must preserve nops.
Error RemoveNops::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    runOnFunction(BF);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return BF.shouldPreserveNops();
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "RemoveNops");
  return Error::success();
}

} // namespace bolt
} // namespace llvm