1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements multiple passes for binary optimization and analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/BinaryPasses.h" 14 #include "bolt/Core/FunctionLayout.h" 15 #include "bolt/Core/ParallelUtilities.h" 16 #include "bolt/Passes/ReorderAlgorithm.h" 17 #include "bolt/Passes/ReorderFunctions.h" 18 #include "llvm/Support/CommandLine.h" 19 #include <atomic> 20 #include <mutex> 21 #include <numeric> 22 #include <vector> 23 24 #define DEBUG_TYPE "bolt-opts" 25 26 using namespace llvm; 27 using namespace bolt; 28 29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) { 30 assert(C > bolt::DynoStats::FIRST_DYNO_STAT && 31 C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category."); 32 33 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1]; 34 35 OptNames[C] = bolt::DynoStats::Description(C); 36 37 std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); 38 39 return OptNames[C].c_str(); 40 } 41 42 namespace opts { 43 44 extern cl::OptionCategory BoltCategory; 45 extern cl::OptionCategory BoltOptCategory; 46 47 extern cl::opt<unsigned> Verbosity; 48 extern cl::opt<bool> EnableBAT; 49 extern cl::opt<unsigned> ExecutionCountThreshold; 50 extern cl::opt<bool> UpdateDebugSections; 51 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions; 52 53 enum DynoStatsSortOrder : char { 54 Ascending, 55 Descending 56 }; 57 58 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt( 59 "print-sorted-by-order", 60 cl::desc("use ascending or descending 
order when printing functions " 61 "ordered by dyno stats"), 62 cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory)); 63 64 cl::list<std::string> 65 HotTextMoveSections("hot-text-move-sections", 66 cl::desc("list of sections containing functions used for hugifying hot text. " 67 "BOLT makes sure these functions are not placed on the same page as " 68 "the hot text. (default=\'.stub,.mover\')."), 69 cl::value_desc("sec1,sec2,sec3,..."), 70 cl::CommaSeparated, 71 cl::ZeroOrMore, 72 cl::cat(BoltCategory)); 73 74 bool isHotTextMover(const BinaryFunction &Function) { 75 for (std::string &SectionName : opts::HotTextMoveSections) { 76 if (Function.getOriginSectionName() && 77 *Function.getOriginSectionName() == SectionName) 78 return true; 79 } 80 81 return false; 82 } 83 84 static cl::opt<bool> MinBranchClusters( 85 "min-branch-clusters", 86 cl::desc("use a modified clustering algorithm geared towards minimizing " 87 "branches"), 88 cl::Hidden, cl::cat(BoltOptCategory)); 89 90 static cl::list<Peepholes::PeepholeOpts> Peepholes( 91 "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"), 92 cl::value_desc("opt1,opt2,opt3,..."), 93 cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"), 94 clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps", 95 "remove double jumps when able"), 96 clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps", 97 "insert tail call traps"), 98 clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches", 99 "remove useless conditional branches"), 100 clEnumValN(Peepholes::PEEP_ALL, "all", 101 "enable all peephole optimizations")), 102 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 103 104 static cl::opt<unsigned> 105 PrintFuncStat("print-function-statistics", 106 cl::desc("print statistics about basic block ordering"), 107 cl::init(0), cl::cat(BoltOptCategory)); 108 109 static cl::opt<bool> PrintLargeFunctions( 110 "print-large-functions", 111 cl::desc("print functions that could not be 
overwritten due to excessive " 112 "size"), 113 cl::init(false), cl::cat(BoltOptCategory)); 114 115 static cl::list<bolt::DynoStats::Category> 116 PrintSortedBy("print-sorted-by", cl::CommaSeparated, 117 cl::desc("print functions sorted by order of dyno stats"), 118 cl::value_desc("key1,key2,key3,..."), 119 cl::values( 120 #define D(name, description, ...) \ 121 clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name), \ 122 description), 123 REAL_DYNO_STATS 124 #undef D 125 clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all", 126 "sorted by all names")), 127 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 128 129 static cl::opt<bool> 130 PrintUnknown("print-unknown", 131 cl::desc("print names of functions with unknown control flow"), 132 cl::cat(BoltCategory), cl::Hidden); 133 134 static cl::opt<bool> 135 PrintUnknownCFG("print-unknown-cfg", 136 cl::desc("dump CFG of functions with unknown control flow"), 137 cl::cat(BoltCategory), cl::ReallyHidden); 138 139 // Please MSVC19 with a forward declaration: otherwise it reports an error about 140 // an undeclared variable inside a callback. 
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    // "cache+" is remapped to "ext-tsp" here, which is why the switch in
    // modifyFunctionLayout() below has no LT_OPTIMIZE_CACHE_PLUS case.
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
StaleThreshold("stale-threshold",
  cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
  cl::init(100),
  cl::Hidden,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

} // namespace opts

namespace llvm {
namespace bolt {

// Default pass gate: only operate on simple functions that have been fully
// disassembled into a CFG and are not explicitly ignored.
bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
  return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
         !BF.isIgnored();
}

bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
  return BF.isSimple() && !BF.isIgnored();
}

// Remove empty basic blocks (except entry points and landing pads) by
// redirecting their predecessors to their single successor, then count
// duplicate CFG edges so fixBranches() can merge them.
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      // Jump-table predecessors cannot be redirected here; bail and keep the
      // block alive for them.
      if (Predecessor->hasJumpTable())
        break;

      if (Predecessor == Successor)
        break;

      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}

Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { runOnFunction(BF); },
      [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
      "NormalizeCFG");
  if (NumBlocksRemoved)
    BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
              << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
  if (NumDuplicateEdgesMerged)
    BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
              << " duplicate CFG edge"
              << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
  return Error::success();
}

// Unreachable-code elimination for a single function: mark blocks not
// reachable from an entry point and erase them, accumulating statistics.
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  BinaryContext &BC = Function.getBinaryContext();
  unsigned Count;
  uint64_t Bytes;
  Function.markUnreachableBlocks();
  LLVM_DEBUG({
    for (BinaryBasicBlock &BB : Function) {
      if (!BB.isValid()) {
        dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
               << " in function " << Function << "\n";
        Function.dump();
      }
    }
  });
  // Use a per-thread code emitter: this pass runs in parallel and the shared
  // emitter is not thread-safe.
  BinaryContext::IndependentCodeEmitter Emitter =
      BC.createIndependentMCCodeEmitter();
  std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
  DeletedBlocks += Count;
  DeletedBytes += Bytes;
  if (Count) {
    // Lock protects Modified and the output stream from concurrent workers.
    auto L = BC.scopeLock();
    Modified.insert(&Function);
    if (opts::Verbosity > 0)
      BC.outs() << "BOLT-INFO: removed " << Count
                << " dead basic block(s) accounting for " << Bytes
                << " bytes in function " << Function << '\n';
  }
}

Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    runOnFunction(BF);
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF) || BF.getLayout().block_empty();
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "elimininate-unreachable");

  if (DeletedBlocks)
    BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
              << DeletedBytes << " bytes of code\n";
  return Error::success();
}

bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
  return (BinaryFunctionPass::shouldPrint(BF) &&
          opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
}

bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return Error::success();

  std::atomic_uint64_t ModifiedFuncCount(0);
  std::mutex FunctionEditDistanceMutex;
  DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;

  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    // Snapshot the pre-reorder layout only when we will report edit distances.
    SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
    if (opts::PrintFuncStat > 0)
      llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));

    const bool LayoutChanged =
        modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
    if (LayoutChanged) {
      ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
      if (opts::PrintFuncStat > 0) {
        const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
        std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
        FunctionEditDistance[&BF] = Distance;
      }
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "ReorderBasicBlocks");
  const size_t NumAllProfiledFunctions =
      BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;

  BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
            << format(
                   "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
                   ModifiedFuncCount.load(std::memory_order_relaxed),
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       NumAllProfiledFunctions,
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       BC.getBinaryFunctions().size());

  if (opts::PrintFuncStat > 0) {
    raw_ostream &OS = BC.outs();
    // Copy all the values into vector in order to sort them
    std::map<uint64_t, BinaryFunction &> ScoreMap;
    auto &BFs = BC.getBinaryFunctions();
    for (auto It = BFs.begin(); It != BFs.end(); ++It)
      ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
          It->second.getFunctionScore(), It->second));

    OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
    OS << "           There are " << BFs.size() << " functions in total. \n";
    OS << "           Number of functions being modified: "
       << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
    OS << "           User asks for detailed information on top "
       << opts::PrintFuncStat << " functions. (Ranked by function score)"
       << "\n\n";
    uint64_t I = 0;
    // Walk the score map from highest to lowest score.
    for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
             ScoreMap.rbegin();
         Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
      BinaryFunction &Function = Rit->second;

      OS << "           Information for function of top: " << (I + 1) << ": \n";
      OS << "             Function Score is: " << Function.getFunctionScore()
         << "\n";
      OS << "             There are " << Function.size()
         << " number of blocks in this function.\n";
      OS << "             There are " << Function.getInstructionCount()
         << " number of instructions in this function.\n";
      OS << "             The edit distance for this function is: "
         << FunctionEditDistance.lookup(&Function) << "\n\n";
    }
  }
  return Error::success();
}

// Select and run a reorder algorithm for \p BF according to \p Type. Returns
// true if the resulting layout differs from the current one.
bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  if (BF.size() == 0 || Type == LT_NONE)
    return false;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
  if (Type != LT_REVERSE && !BF.hasValidProfile())
    return false;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    // LT_OPTIMIZE_CACHE_PLUS is rewritten to LT_OPTIMIZE_EXT_TSP by the
    // -reorder-blocks option callback, so it cannot reach this switch.
    default:
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  return BF.getLayout().update(NewLayout);
}

Error FixupBranches::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (!BC.shouldEmit(Function) || !Function.isSimple())
      continue;

    Function.fixBranches();
  }
  return Error::success();
}

Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  std::atomic<bool> HasFatal{false};
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      // In relocation mode a broken CFI state cannot be worked around by
      // keeping the original function body, so it is fatal.
      if (BC.HasRelocations) {
        BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
                  << ". Exiting.\n";
        HasFatal = true;
        return;
      }
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
  if (HasFatal)
    return createFatalBOLTError("finalize CFI state failure");
  return Error::success();
}

Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
  // In relocation mode functions can be moved, so size is not a constraint.
  if (BC.HasRelocations)
    return Error::success();

  // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
  // incorrect meta data.
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    uint64_t HotSize, ColdSize;
    std::tie(HotSize, ColdSize) =
        BC.calculateEmittedSize(BF, /*FixBranches=*/false);
    if (HotSize > BF.getMaxSize()) {
      if (opts::PrintLargeFunctions)
        BC.outs() << "BOLT-INFO: " << BF << " size exceeds allocated space by "
                  << (HotSize - BF.getMaxSize()) << " bytes\n";
      BF.setSimple(false);
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "CheckLargeFunctions");

  return Error::success();
}

bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Unlike other passes, allow functions in non-CFG state.
  return BF.isSimple() && !BF.isIgnored();
}

Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  // Convert GnuArgsSize annotations into CFIs.
  for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
    for (FunctionFragment &FF : BF->getLayout().fragments()) {
      // Reset at the start of the new fragment.
      int64_t CurrentGnuArgsSize = 0;

      for (BinaryBasicBlock *const BB : FF) {
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
            continue;

          const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
          assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
          // Only emit a CFI when the value actually changes.
          if (NewGnuArgsSize == CurrentGnuArgsSize)
            continue;

          auto InsertII = BF->addCFIInstruction(
              BB, II,
              MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
          CurrentGnuArgsSize = NewGnuArgsSize;
          // Skip over the CFI we just inserted so it is not revisited.
          II = std::next(InsertII);
        }
      }
    }
  }
  return Error::success();
}

// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
Error CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.getValue().Symbol;
    if (!S)
      continue;
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
  return Error::success();
}

// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
// B0: ...
//     jmp  B1   (or jcc B1)
//
// B1: jmp  B2
//
// ->
//
// B0: ...
//     jmp  B2   (or jcc B2)
//
// Returns the number of double jumps removed. If \p MarkInvalid is set,
// blocks left without predecessors are marked invalid for later erasure.
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Redirect \p Pred past BB to \p Succ (or, for the tail-call case where
    // Succ is null, to the symbol \p SuccSym). Returns true on success.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym,
                             std::optional<uint32_t> Offset) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case.  In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
          // Carry the original offset over to the new tail call (used by BAT).
          if (Offset) {
            MCInst *TailCall = Pred->getLastNonPseudoInstr();
            assert(TailCall);
            MIB->setOffset(*TailCall, *Offset);
          }
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // Candidate middle block: exactly one real instruction, not a landing pad.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Copy the predecessor list: checkAndPatch mutates BB's predecessors.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
            MarkInvalid)
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}

// Decide whether the conditional branch in \p PredBB targeting \p BB should
// be rewritten into a conditional tail call, based on -sctc-mode and, for
// the heuristic mode, the branch's taken frequency. \p DirectionFlag is the
// forwardness of the would-be conditional tail call.
bool SimplifyConditionalTailCalls::shouldRewriteBranch(
    const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
    const BinaryBasicBlock *BB, const bool DirectionFlag) {
  if (BeenOptimized.count(PredBB))
    return false;

  const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);

  if (IsForward)
    ++NumOrigForwardBranches;
  else
    ++NumOrigBackwardBranches;

  if (opts::SctcMode == opts::SctcAlways)
    return true;

  if (opts::SctcMode == opts::SctcPreserveDirection)
    return IsForward == DirectionFlag;

  const ErrorOr<std::pair<double, double>> Frequency =
      PredBB->getBranchStats(BB);

  // It's ok to rewrite the conditional branch if the new target will be
  // a backward branch.

  // If no data available for these branches, then it should be ok to
  // do the optimization since it will reduce code size.
  if (Frequency.getError())
    return true;

  // TODO: should this use misprediction frequency instead?
  const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
                      (!IsForward && Frequency.get().first <= 0.5);

  return Result == DirectionFlag;
}

// Rewrite "jcc BB; ... BB: jmp callee (tail call)" into a conditional tail
// call "jcc callee" in each eligible predecessor. Returns non-zero if any
// rewrite happened.
uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch' direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      // NOTE(review): Result is always true here — the failure case exited via
      // the continue above — so this assert is redundant but harmless.
      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // Skip dynamic branches for now.
      if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch))
        continue;

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid.  We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add a unconditional
        // branch to the old target.  This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(PredBB, CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc
      MIB->setConditionalTailCall(*CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;
      // Preserve Offset annotation, used in BAT.
      // Instr is a direct tail call instruction that was created when CTCs are
      // first expanded, and has the original CTC offset set.
      if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr))
        MIB->setOffset(*CondBranch, *Offset);

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(BB);
      // Update BB execution count
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block.  Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(PredBB, false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    if (UncondBranch) {
      if (HasFallthrough)
        PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
      else
        MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(BF, true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n";);

  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}

Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
  // SCTC rewriting is only implemented for x86.
  if (!BC.isX86())
    return Error::success();

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;

    if (!shouldOptimize(Function))
      continue;

    if (fixTailCalls(Function)) {
      Modified.insert(&Function);
      Function.setHasCanonicalCFG(false);
    }
  }

  if (NumTailCallsPatched)
    BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
              << " tail calls (" << NumOrigForwardBranches << " forward)"
              << " tail calls (" << NumOrigBackwardBranches << " backward)"
              << " from a total of " << NumCandidateTailCalls
              << " while removing " << NumDoubleJumps << " double jumps"
              << " and removing " << DeletedBlocks << " basic blocks"
              << " totalling " << DeletedBytes
              << " bytes of code. CTCs total execution count is "
              << CTCExecCount << " and the number of times CTCs are taken is "
              << CTCTakenCount << "\n";
  return Error::success();
}

uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
  uint64_t Count = 0;
  const BinaryContext &BC = Function.getBinaryContext();
  for (BinaryBasicBlock &BB : Function) {
    for (MCInst &Inst : BB) {
      // Skip shortening instructions with Size annotation.
1043 if (BC.MIB->getSize(Inst)) 1044 continue; 1045 1046 MCInst OriginalInst; 1047 if (opts::Verbosity > 2) 1048 OriginalInst = Inst; 1049 1050 if (!BC.MIB->shortenInstruction(Inst, *BC.STI)) 1051 continue; 1052 1053 if (opts::Verbosity > 2) { 1054 BC.scopeLock(); 1055 BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: "; 1056 BC.printInstruction(BC.outs(), OriginalInst, 0, &Function); 1057 BC.outs() << "BOLT-INFO: to:"; 1058 BC.printInstruction(BC.outs(), Inst, 0, &Function); 1059 } 1060 1061 ++Count; 1062 } 1063 } 1064 1065 return Count; 1066 } 1067 1068 Error ShortenInstructions::runOnFunctions(BinaryContext &BC) { 1069 std::atomic<uint64_t> NumShortened{0}; 1070 if (!BC.isX86()) 1071 return Error::success(); 1072 1073 ParallelUtilities::runOnEachFunction( 1074 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, 1075 [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); }, 1076 nullptr, "ShortenInstructions"); 1077 1078 if (NumShortened) 1079 BC.outs() << "BOLT-INFO: " << NumShortened 1080 << " instructions were shortened\n"; 1081 return Error::success(); 1082 } 1083 1084 void Peepholes::addTailcallTraps(BinaryFunction &Function) { 1085 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 1086 for (BinaryBasicBlock &BB : Function) { 1087 MCInst *Inst = BB.getLastNonPseudoInstr(); 1088 if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) { 1089 MCInst Trap; 1090 MIB->createTrap(Trap); 1091 BB.addInstruction(Trap); 1092 ++TailCallTraps; 1093 } 1094 } 1095 } 1096 1097 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { 1098 for (BinaryBasicBlock &BB : Function) { 1099 if (BB.succ_size() != 2) 1100 continue; 1101 1102 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true); 1103 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false); 1104 if (CondBB != UncondBB) 1105 continue; 1106 1107 const MCSymbol *TBB = nullptr; 1108 const MCSymbol *FBB = nullptr; 1109 MCInst *CondBranch = nullptr; 1110 MCInst 
*UncondBranch = nullptr; 1111 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 1112 1113 // analyzeBranch() can fail due to unusual branch instructions, 1114 // e.g. jrcxz, or jump tables (indirect jump). 1115 if (!Result || !CondBranch) 1116 continue; 1117 1118 BB.removeDuplicateConditionalSuccessor(CondBranch); 1119 ++NumUselessCondBranches; 1120 } 1121 } 1122 1123 Error Peepholes::runOnFunctions(BinaryContext &BC) { 1124 const char Opts = 1125 std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0, 1126 [](const char A, const PeepholeOpts B) { return A | B; }); 1127 if (Opts == PEEP_NONE) 1128 return Error::success(); 1129 1130 for (auto &It : BC.getBinaryFunctions()) { 1131 BinaryFunction &Function = It.second; 1132 if (shouldOptimize(Function)) { 1133 if (Opts & PEEP_DOUBLE_JUMPS) 1134 NumDoubleJumps += fixDoubleJumps(Function, false); 1135 if (Opts & PEEP_TAILCALL_TRAPS) 1136 addTailcallTraps(Function); 1137 if (Opts & PEEP_USELESS_BRANCHES) 1138 removeUselessCondBranches(Function); 1139 assert(Function.validateCFG()); 1140 } 1141 } 1142 BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps 1143 << " double jumps patched.\n" 1144 << "BOLT-INFO: Peephole: " << TailCallTraps 1145 << " tail call traps inserted.\n" 1146 << "BOLT-INFO: Peephole: " << NumUselessCondBranches 1147 << " useless conditional branches removed.\n"; 1148 return Error::success(); 1149 } 1150 1151 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) { 1152 BinaryContext &BC = BF.getBinaryContext(); 1153 MCPlusBuilder *MIB = BC.MIB.get(); 1154 1155 uint64_t NumLocalLoadsSimplified = 0; 1156 uint64_t NumDynamicLocalLoadsSimplified = 0; 1157 uint64_t NumLocalLoadsFound = 0; 1158 uint64_t NumDynamicLocalLoadsFound = 0; 1159 1160 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { 1161 for (MCInst &Inst : *BB) { 1162 unsigned Opcode = Inst.getOpcode(); 1163 const MCInstrDesc &Desc = BC.MII->get(Opcode); 1164 1165 // Skip instructions that do not load 
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address;
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        // No resolvable symbol behind the displacement expression - give up.
        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        // Non-PC-relative operand whose address cannot be computed statically.
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
1201 ErrorOr<BinarySection &> DataSection = 1202 BC.getSectionForAddress(TargetAddress); 1203 if (!DataSection || DataSection->isWritable()) 1204 continue; 1205 1206 if (BC.getRelocationAt(TargetAddress) || 1207 BC.getDynamicRelocationAt(TargetAddress)) 1208 continue; 1209 1210 uint32_t Offset = TargetAddress - DataSection->getAddress(); 1211 StringRef ConstantData = DataSection->getContents(); 1212 1213 ++NumLocalLoadsFound; 1214 if (BB->hasProfile()) 1215 NumDynamicLocalLoadsFound += BB->getExecutionCount(); 1216 1217 if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { 1218 ++NumLocalLoadsSimplified; 1219 if (BB->hasProfile()) 1220 NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); 1221 } 1222 } 1223 } 1224 1225 NumLoadsFound += NumLocalLoadsFound; 1226 NumDynamicLoadsFound += NumDynamicLocalLoadsFound; 1227 NumLoadsSimplified += NumLocalLoadsSimplified; 1228 NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; 1229 1230 return NumLocalLoadsSimplified > 0; 1231 } 1232 1233 Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { 1234 for (auto &It : BC.getBinaryFunctions()) { 1235 BinaryFunction &Function = It.second; 1236 if (shouldOptimize(Function) && simplifyRODataLoads(Function)) 1237 Modified.insert(&Function); 1238 } 1239 1240 BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " 1241 << NumLoadsFound << " loads from a statically computed address.\n" 1242 << "BOLT-INFO: dynamic loads simplified: " 1243 << NumDynamicLoadsSimplified << "\n" 1244 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound 1245 << "\n"; 1246 return Error::success(); 1247 } 1248 1249 Error AssignSections::runOnFunctions(BinaryContext &BC) { 1250 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { 1251 Function->setCodeSectionName(BC.getInjectedCodeSectionName()); 1252 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); 1253 } 1254 1255 // In non-relocation mode functions have pre-assigned 
section names. 1256 if (!BC.HasRelocations) 1257 return Error::success(); 1258 1259 const bool UseColdSection = 1260 BC.NumProfiledFuncs > 0 || 1261 opts::ReorderFunctions == ReorderFunctions::RT_USER; 1262 for (auto &BFI : BC.getBinaryFunctions()) { 1263 BinaryFunction &Function = BFI.second; 1264 if (opts::isHotTextMover(Function)) { 1265 Function.setCodeSectionName(BC.getHotTextMoverSectionName()); 1266 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); 1267 continue; 1268 } 1269 1270 if (!UseColdSection || Function.hasValidIndex()) 1271 Function.setCodeSectionName(BC.getMainCodeSectionName()); 1272 else 1273 Function.setCodeSectionName(BC.getColdCodeSectionName()); 1274 1275 if (Function.isSplit()) 1276 Function.setColdCodeSectionName(BC.getColdCodeSectionName()); 1277 } 1278 return Error::success(); 1279 } 1280 1281 Error PrintProfileStats::runOnFunctions(BinaryContext &BC) { 1282 double FlowImbalanceMean = 0.0; 1283 size_t NumBlocksConsidered = 0; 1284 double WorstBias = 0.0; 1285 const BinaryFunction *WorstBiasFunc = nullptr; 1286 1287 // For each function CFG, we fill an IncomingMap with the sum of the frequency 1288 // of incoming edges for each BB. Likewise for each OutgoingMap and the sum 1289 // of the frequency of outgoing edges. 
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      // Branch info runs in lock-step with the successor list, so the
      // iterator is advanced on every path through this loop.
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        // Edges without profile data (or never taken) contribute nothing.
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      // Relative in/out flow difference per block; in a perfectly consistent
      // profile, incoming and outgoing frequencies would match.
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    // Track the function with the highest per-function average imbalance.
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function =
        BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      // Same block filter as the mean pass, except entry blocks are not
      // excluded here.
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report to user
  BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                      (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    BC.outs() << "Worst average bias observed in "
              << WorstBiasFunc->getPrintName() << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
  return Error::success();
}

Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumRegularFunctions = 0;
  uint64_t NumStaleProfileFunctions = 0;
  uint64_t NumAllStaleFunctions = 0;
  uint64_t NumInferredFunctions = 0;
  uint64_t NumNonSimpleProfiledFunctions = 0;
  uint64_t NumUnknownControlFlowFunctions = 0;
  uint64_t TotalSampleCount = 0;
  uint64_t StaleSampleCount = 0;
  uint64_t InferredSampleCount = 0;
  std::vector<const BinaryFunction *> ProfiledFunctions;
  // Header printed lazily before the first stale function, then cleared.
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;

    // Ignore PLT functions for stats.
    if (Function.isPLTFunction())
      continue;

    // Adjustment for BAT mode: the profile for BOLT split fragments is combined
    // so only count the hot fragment.
    const uint64_t Address = Function.getAddress();
    // Hot parent of a BOLT-split function: has fragments, is known to BAT,
    // and has no parent of its own.
    bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() &&
                                          BAT && BAT->isBATFunction(Address) &&
                                          !BAT->fetchParentAddress(Address);

    ++NumRegularFunctions;

    // In BOLTed binaries split functions are non-simple (due to non-relocation
    // mode), but the original function is known to be simple and we have a
    // valid profile for it.
    if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) {
      if (Function.hasProfile())
        ++NumNonSimpleProfiledFunctions;
      continue;
    }

    if (Function.hasUnknownControlFlow()) {
      if (opts::PrintUnknownCFG)
        Function.dump();
      else if (opts::PrintUnknown)
        BC.errs() << "function with unknown control flow: " << Function << '\n';

      ++NumUnknownControlFlowFunctions;
    }

    if (!Function.hasProfile())
      continue;

    uint64_t SampleCount = Function.getRawBranchCount();
    TotalSampleCount += SampleCount;

    if (Function.hasValidProfile()) {
      ProfiledFunctions.push_back(&Function);
      // Inferred profiles are counted as stale for reporting purposes.
      if (Function.hasInferredProfile()) {
        ++NumInferredFunctions;
        InferredSampleCount += SampleCount;
        ++NumAllStaleFunctions;
      }
    } else {
      if (opts::ReportStaleFuncs) {
        // Print the header only once, before the first stale function.
        BC.outs() << StaleFuncsHeader;
        StaleFuncsHeader = "";
        BC.outs() << " " << Function << '\n';
      }
      ++NumStaleProfileFunctions;
      StaleSampleCount += SampleCount;
      ++NumAllStaleFunctions;
    }
  }
  BC.NumProfiledFuncs = ProfiledFunctions.size();
  BC.NumStaleProfileFuncs = NumStaleProfileFunctions;

  const size_t NumAllProfiledFunctions =
      ProfiledFunctions.size() + NumStaleProfileFunctions;
  BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
            << NumRegularFunctions << " functions in the binary ("
            << format("%.1f", NumAllProfiledFunctions /
                                  (float)NumRegularFunctions * 100.0f)
            << "%) have non-empty execution profile\n";
  if (NumNonSimpleProfiledFunctions) {
    BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
              << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
              << " with profile could not be optimized\n";
  }
  if (NumAllStaleFunctions) {
    const float PctStale =
        NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
    const float PctStaleFuncsWithEqualBlockCount =
        (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
        NumAllStaleFunctions * 100.0f;
    const float PctStaleBlocksWithEqualIcount =
        (float)BC.Stats.NumStaleBlocksWithEqualIcount /
        BC.Stats.NumStaleBlocks * 100.0f;
    // Severity of the message depends on whether the stale percentage
    // exceeds the user-configured threshold.
    auto printErrorOrWarning = [&]() {
      if (PctStale > opts::StaleThreshold)
        BC.errs() << "BOLT-ERROR: ";
      else
        BC.errs() << "BOLT-WARNING: ";
    };
    printErrorOrWarning();
    BC.errs() << NumAllStaleFunctions
              << format(" (%.1f%% of all profiled)", PctStale) << " function"
              << (NumAllStaleFunctions == 1 ? "" : "s")
              << " have invalid (possibly stale) profile."
                 " Use -report-stale to see the list.\n";
    if (TotalSampleCount > 0) {
      printErrorOrWarning();
      BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of "
                << TotalSampleCount << " samples in the binary ("
                << format("%.1f",
                          ((100.0f * (StaleSampleCount + InferredSampleCount)) /
                           TotalSampleCount))
                << "%) belong to functions with invalid"
                   " (possibly stale) profile.\n";
    }
    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
              << " stale function"
              << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
              << format(" (%.1f%% of all stale)",
                        PctStaleFuncsWithEqualBlockCount)
              << " have matching block count.\n";
    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
              << " stale block"
              << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ?
"" : "s") 1500 << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount) 1501 << " have matching icount.\n"; 1502 if (PctStale > opts::StaleThreshold) { 1503 return createFatalBOLTError( 1504 Twine("BOLT-ERROR: stale functions exceed specified threshold of ") + 1505 Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n")); 1506 } 1507 } 1508 if (NumInferredFunctions) { 1509 BC.outs() << format( 1510 "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, " 1511 "%.2f%% of stale) functions responsible for %.2f%% samples" 1512 " (%zu out of %zu)\n", 1513 NumInferredFunctions, 1514 100.0 * NumInferredFunctions / NumAllProfiledFunctions, 1515 100.0 * NumInferredFunctions / NumAllStaleFunctions, 1516 100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount, 1517 TotalSampleCount); 1518 BC.outs() << format( 1519 "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" 1520 " (%zu out of %zu stale) responsible for %.2f%% samples" 1521 " (%zu out of %zu stale)\n", 1522 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, 1523 BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, 1524 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, 1525 BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); 1526 } 1527 1528 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1529 BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1530 << " objects was ignored\n"; 1531 } 1532 1533 if (ProfiledFunctions.size() > 10) { 1534 if (opts::Verbosity >= 1) { 1535 BC.outs() << "BOLT-INFO: top called functions are:\n"; 1536 llvm::sort(ProfiledFunctions, 1537 [](const BinaryFunction *A, const BinaryFunction *B) { 1538 return B->getExecutionCount() < A->getExecutionCount(); 1539 }); 1540 auto SFI = ProfiledFunctions.begin(); 1541 auto SFIend = ProfiledFunctions.end(); 1542 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1543 ++SFI, ++I) 1544 BC.outs() << " " << **SFI << " : " << 
(*SFI)->getExecutionCount() 1545 << '\n'; 1546 } 1547 } 1548 1549 if (!opts::PrintSortedBy.empty()) { 1550 std::vector<BinaryFunction *> Functions; 1551 std::map<const BinaryFunction *, DynoStats> Stats; 1552 1553 for (auto &BFI : BC.getBinaryFunctions()) { 1554 BinaryFunction &BF = BFI.second; 1555 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1556 Functions.push_back(&BF); 1557 Stats.emplace(&BF, getDynoStats(BF)); 1558 } 1559 } 1560 1561 const bool SortAll = 1562 llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT); 1563 1564 const bool Ascending = 1565 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1566 1567 std::function<bool(const DynoStats &, const DynoStats &)> 1568 DynoStatsComparator = 1569 SortAll ? [](const DynoStats &StatsA, 1570 const DynoStats &StatsB) { return StatsA < StatsB; } 1571 : [](const DynoStats &StatsA, const DynoStats &StatsB) { 1572 return StatsA.lessThan(StatsB, opts::PrintSortedBy); 1573 }; 1574 1575 llvm::stable_sort(Functions, 1576 [Ascending, &Stats, DynoStatsComparator]( 1577 const BinaryFunction *A, const BinaryFunction *B) { 1578 auto StatsItr = Stats.find(A); 1579 assert(StatsItr != Stats.end()); 1580 const DynoStats &StatsA = StatsItr->second; 1581 1582 StatsItr = Stats.find(B); 1583 assert(StatsItr != Stats.end()); 1584 const DynoStats &StatsB = StatsItr->second; 1585 1586 return Ascending ? 
                                         DynoStatsComparator(StatsA, StatsB)
                                         : DynoStatsComparator(StatsB, StatsA);
                      });

    BC.outs() << "BOLT-INFO: top functions sorted by ";
    if (SortAll) {
      BC.outs() << "dyno stats";
    } else {
      // Print the comma-separated list of categories used for sorting.
      BC.outs() << "(";
      bool PrintComma = false;
      for (const DynoStats::Category Category : opts::PrintSortedBy) {
        if (PrintComma)
          BC.outs() << ", ";
        BC.outs() << DynoStats::Description(Category);
        PrintComma = true;
      }
      BC.outs() << ")";
    }

    BC.outs() << " are:\n";
    // Print at most the first 100 functions in sorted order.
    auto SFI = Functions.begin();
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
      const DynoStats Stats = getDynoStats(**SFI);
      BC.outs() << " " << **SFI;
      if (!SortAll) {
        // Annotate each function with its value in every sorted category.
        BC.outs() << " (";
        bool PrintComma = false;
        for (const DynoStats::Category Category : opts::PrintSortedBy) {
          if (PrintComma)
            BC.outs() << ", ";
          BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
          PrintComma = true;
        }
        BC.outs() << ")";
      }
      BC.outs() << "\n";
    }
  }

  if (!BC.TrappedFunctions.empty()) {
    BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
              << (BC.TrappedFunctions.size() > 1 ? "s" : "")
              << " will trap on entry. Use -trap-avx512=0 to disable"
                 " traps.";
    // Short lists are printed unconditionally; long ones only under -v=1.
    if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
      BC.errs() << '\n';
      for (const BinaryFunction *Function : BC.TrappedFunctions)
        BC.errs() << " " << *Function << '\n';
    } else {
      BC.errs() << " Use -v=1 to see the list.\n";
    }
  }

  // Collect and print information about suboptimal code layout on input.
1640 if (opts::ReportBadLayout) { 1641 std::vector<BinaryFunction *> SuboptimalFuncs; 1642 for (auto &BFI : BC.getBinaryFunctions()) { 1643 BinaryFunction &BF = BFI.second; 1644 if (!BF.hasValidProfile()) 1645 continue; 1646 1647 const uint64_t HotThreshold = 1648 std::max<uint64_t>(BF.getKnownExecutionCount(), 1); 1649 bool HotSeen = false; 1650 for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) { 1651 if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { 1652 HotSeen = true; 1653 continue; 1654 } 1655 if (HotSeen && BB->getKnownExecutionCount() == 0) { 1656 SuboptimalFuncs.push_back(&BF); 1657 break; 1658 } 1659 } 1660 } 1661 1662 if (!SuboptimalFuncs.empty()) { 1663 llvm::sort(SuboptimalFuncs, 1664 [](const BinaryFunction *A, const BinaryFunction *B) { 1665 return A->getKnownExecutionCount() / A->getSize() > 1666 B->getKnownExecutionCount() / B->getSize(); 1667 }); 1668 1669 BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size() 1670 << " functions have " 1671 "cold code in the middle of hot code. Top functions are:\n"; 1672 for (unsigned I = 0; 1673 I < std::min(static_cast<size_t>(opts::ReportBadLayout), 1674 SuboptimalFuncs.size()); 1675 ++I) 1676 SuboptimalFuncs[I]->print(BC.outs()); 1677 } 1678 } 1679 1680 if (NumUnknownControlFlowFunctions) { 1681 BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions 1682 << " functions have instructions with unknown control flow"; 1683 if (!opts::PrintUnknown) 1684 BC.outs() << ". 
Use -print-unknown to see the list."; 1685 BC.outs() << '\n'; 1686 } 1687 return Error::success(); 1688 } 1689 1690 Error InstructionLowering::runOnFunctions(BinaryContext &BC) { 1691 for (auto &BFI : BC.getBinaryFunctions()) 1692 for (BinaryBasicBlock &BB : BFI.second) 1693 for (MCInst &Instruction : BB) 1694 BC.MIB->lowerTailCall(Instruction); 1695 return Error::success(); 1696 } 1697 1698 Error StripRepRet::runOnFunctions(BinaryContext &BC) { 1699 if (!BC.isX86()) 1700 return Error::success(); 1701 1702 uint64_t NumPrefixesRemoved = 0; 1703 uint64_t NumBytesSaved = 0; 1704 for (auto &BFI : BC.getBinaryFunctions()) { 1705 for (BinaryBasicBlock &BB : BFI.second) { 1706 auto LastInstRIter = BB.getLastNonPseudo(); 1707 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) || 1708 !BC.MIB->deleteREPPrefix(*LastInstRIter)) 1709 continue; 1710 1711 NumPrefixesRemoved += BB.getKnownExecutionCount(); 1712 ++NumBytesSaved; 1713 } 1714 } 1715 1716 if (NumBytesSaved) 1717 BC.outs() << "BOLT-INFO: removed " << NumBytesSaved 1718 << " 'repz' prefixes" 1719 " with estimated execution count of " 1720 << NumPrefixesRemoved << " times.\n"; 1721 return Error::success(); 1722 } 1723 1724 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { 1725 if (!BC.isX86()) 1726 return Error::success(); 1727 1728 uint64_t NumInlined = 0; 1729 uint64_t NumInlinedDyno = 0; 1730 for (auto &BFI : BC.getBinaryFunctions()) { 1731 for (BinaryBasicBlock &BB : BFI.second) { 1732 for (auto II = BB.begin(); II != BB.end(); ++II) { 1733 MCInst &Inst = *II; 1734 1735 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1736 !Inst.getOperand(0).isExpr()) 1737 continue; 1738 1739 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1740 if (CalleeSymbol->getName() != "memcpy" && 1741 CalleeSymbol->getName() != "memcpy@PLT" && 1742 CalleeSymbol->getName() != "_memcpy8") 1743 continue; 1744 1745 const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); 1746 
const bool IsTailCall = BC.MIB->isTailCall(Inst); 1747 1748 const InstructionListType NewCode = 1749 BC.MIB->createInlineMemcpy(IsMemcpy8); 1750 II = BB.replaceInstruction(II, NewCode); 1751 std::advance(II, NewCode.size() - 1); 1752 if (IsTailCall) { 1753 MCInst Return; 1754 BC.MIB->createReturn(Return); 1755 II = BB.insertInstruction(std::next(II), std::move(Return)); 1756 } 1757 1758 ++NumInlined; 1759 NumInlinedDyno += BB.getKnownExecutionCount(); 1760 } 1761 } 1762 } 1763 1764 if (NumInlined) { 1765 BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; 1766 if (NumInlinedDyno) 1767 BC.outs() << ". The calls were executed " << NumInlinedDyno 1768 << " times based on profile."; 1769 BC.outs() << '\n'; 1770 } 1771 return Error::success(); 1772 } 1773 1774 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { 1775 if (!BinaryFunctionPass::shouldOptimize(Function)) 1776 return false; 1777 1778 for (const std::string &FunctionSpec : Spec) { 1779 StringRef FunctionName = StringRef(FunctionSpec).split(':').first; 1780 if (Function.hasNameRegex(FunctionName)) 1781 return true; 1782 } 1783 1784 return false; 1785 } 1786 1787 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( 1788 const BinaryFunction &Function) const { 1789 StringRef SitesString; 1790 for (const std::string &FunctionSpec : Spec) { 1791 StringRef FunctionName; 1792 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':'); 1793 if (Function.hasNameRegex(FunctionName)) 1794 break; 1795 SitesString = ""; 1796 } 1797 1798 std::set<size_t> Sites; 1799 SmallVector<StringRef, 4> SitesVec; 1800 SitesString.split(SitesVec, ':'); 1801 for (StringRef SiteString : SitesVec) { 1802 if (SiteString.empty()) 1803 continue; 1804 size_t Result; 1805 if (!SiteString.getAsInteger(10, Result)) 1806 Sites.emplace(Result); 1807 } 1808 1809 return Sites; 1810 } 1811 1812 Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { 1813 if (!BC.isX86()) 1814 
return Error::success(); 1815 1816 uint64_t NumSpecialized = 0; 1817 uint64_t NumSpecializedDyno = 0; 1818 for (auto &BFI : BC.getBinaryFunctions()) { 1819 BinaryFunction &Function = BFI.second; 1820 if (!shouldOptimize(Function)) 1821 continue; 1822 1823 std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function); 1824 auto shouldOptimize = [&](size_t N) { 1825 return CallsToOptimize.empty() || CallsToOptimize.count(N); 1826 }; 1827 1828 std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); 1829 size_t CallSiteID = 0; 1830 for (BinaryBasicBlock *CurBB : Blocks) { 1831 for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { 1832 MCInst &Inst = *II; 1833 1834 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1835 !Inst.getOperand(0).isExpr()) 1836 continue; 1837 1838 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1839 if (CalleeSymbol->getName() != "memcpy" && 1840 CalleeSymbol->getName() != "memcpy@PLT") 1841 continue; 1842 1843 if (BC.MIB->isTailCall(Inst)) 1844 continue; 1845 1846 ++CallSiteID; 1847 1848 if (!shouldOptimize(CallSiteID)) 1849 continue; 1850 1851 // Create a copy of a call to memcpy(dest, src, size). 
1852 MCInst MemcpyInstr = Inst; 1853 1854 BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II); 1855 1856 BinaryBasicBlock *NextBB = nullptr; 1857 if (OneByteMemcpyBB->getNumNonPseudos() > 1) { 1858 NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin()); 1859 NextBB->eraseInstruction(NextBB->begin()); 1860 } else { 1861 NextBB = OneByteMemcpyBB->getSuccessor(); 1862 OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin()); 1863 assert(NextBB && "unexpected call to memcpy() with no return"); 1864 } 1865 1866 BinaryBasicBlock *MemcpyBB = Function.addBasicBlock(); 1867 MemcpyBB->setOffset(CurBB->getInputOffset()); 1868 InstructionListType CmpJCC = 1869 BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1, 1870 OneByteMemcpyBB->getLabel(), BC.Ctx.get()); 1871 CurBB->addInstructions(CmpJCC); 1872 CurBB->addSuccessor(MemcpyBB); 1873 1874 MemcpyBB->addInstruction(std::move(MemcpyInstr)); 1875 MemcpyBB->addSuccessor(NextBB); 1876 MemcpyBB->setCFIState(NextBB->getCFIState()); 1877 MemcpyBB->setExecutionCount(0); 1878 1879 // To prevent the actual call from being moved to cold, we set its 1880 // execution count to 1. 1881 if (CurBB->getKnownExecutionCount() > 0) 1882 MemcpyBB->setExecutionCount(1); 1883 1884 InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy(); 1885 OneByteMemcpyBB->addInstructions(OneByteMemcpy); 1886 1887 ++NumSpecialized; 1888 NumSpecializedDyno += CurBB->getKnownExecutionCount(); 1889 1890 CurBB = NextBB; 1891 1892 // Note: we don't expect the next instruction to be a call to memcpy. 1893 II = CurBB->begin(); 1894 } 1895 } 1896 } 1897 1898 if (NumSpecialized) { 1899 BC.outs() << "BOLT-INFO: specialized " << NumSpecialized 1900 << " memcpy() call sites for size 1"; 1901 if (NumSpecializedDyno) 1902 BC.outs() << ". 
The calls were executed " << NumSpecializedDyno 1903 << " times based on profile."; 1904 BC.outs() << '\n'; 1905 } 1906 return Error::success(); 1907 } 1908 1909 void RemoveNops::runOnFunction(BinaryFunction &BF) { 1910 const BinaryContext &BC = BF.getBinaryContext(); 1911 for (BinaryBasicBlock &BB : BF) { 1912 for (int64_t I = BB.size() - 1; I >= 0; --I) { 1913 MCInst &Inst = BB.getInstructionAtIndex(I); 1914 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP")) 1915 BB.eraseInstructionAtIndex(I); 1916 } 1917 } 1918 } 1919 1920 Error RemoveNops::runOnFunctions(BinaryContext &BC) { 1921 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 1922 runOnFunction(BF); 1923 }; 1924 1925 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 1926 return BF.shouldPreserveNops(); 1927 }; 1928 1929 ParallelUtilities::runOnEachFunction( 1930 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 1931 SkipFunc, "RemoveNops"); 1932 return Error::success(); 1933 } 1934 1935 } // namespace bolt 1936 } // namespace llvm 1937