1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements multiple passes for binary optimization and analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/BinaryPasses.h" 14 #include "bolt/Core/FunctionLayout.h" 15 #include "bolt/Core/ParallelUtilities.h" 16 #include "bolt/Passes/ReorderAlgorithm.h" 17 #include "bolt/Passes/ReorderFunctions.h" 18 #include "llvm/Support/CommandLine.h" 19 #include <atomic> 20 #include <mutex> 21 #include <numeric> 22 #include <vector> 23 24 #define DEBUG_TYPE "bolt-opts" 25 26 using namespace llvm; 27 using namespace bolt; 28 29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) { 30 assert(C > bolt::DynoStats::FIRST_DYNO_STAT && 31 C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category."); 32 33 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1]; 34 35 OptNames[C] = bolt::DynoStats::Description(C); 36 37 std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); 38 39 return OptNames[C].c_str(); 40 } 41 42 namespace opts { 43 44 extern cl::OptionCategory BoltCategory; 45 extern cl::OptionCategory BoltOptCategory; 46 47 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion; 48 extern cl::opt<unsigned> Verbosity; 49 extern cl::opt<bool> EnableBAT; 50 extern cl::opt<unsigned> ExecutionCountThreshold; 51 extern cl::opt<bool> UpdateDebugSections; 52 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions; 53 54 enum DynoStatsSortOrder : char { 55 Ascending, 56 Descending 57 }; 58 59 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt( 60 "print-sorted-by-order", 61 cl::desc("use ascending or descending order when printing functions " 62 "ordered by dyno stats"), 63 cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory)); 64 65 cl::list<std::string> 66 HotTextMoveSections("hot-text-move-sections", 67 cl::desc("list of sections containing functions used for hugifying hot text. " 68 "BOLT makes sure these functions are not placed on the same page as " 69 "the hot text. (default=\'.stub,.mover\')."), 70 cl::value_desc("sec1,sec2,sec3,..."), 71 cl::CommaSeparated, 72 cl::ZeroOrMore, 73 cl::cat(BoltCategory)); 74 75 bool isHotTextMover(const BinaryFunction &Function) { 76 for (std::string &SectionName : opts::HotTextMoveSections) { 77 if (Function.getOriginSectionName() && 78 *Function.getOriginSectionName() == SectionName) 79 return true; 80 } 81 82 return false; 83 } 84 85 static cl::opt<bool> MinBranchClusters( 86 "min-branch-clusters", 87 cl::desc("use a modified clustering algorithm geared towards minimizing " 88 "branches"), 89 cl::Hidden, cl::cat(BoltOptCategory)); 90 91 static cl::list<Peepholes::PeepholeOpts> Peepholes( 92 "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"), 93 cl::value_desc("opt1,opt2,opt3,..."), 94 cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"), 95 clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps", 96 "remove double jumps when able"), 97 clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps", 98 "insert tail call traps"), 99 clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches", 100 "remove useless conditional branches"), 101 clEnumValN(Peepholes::PEEP_ALL, "all", 102 "enable all peephole optimizations")), 103 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 104 105 static cl::opt<unsigned> 106 PrintFuncStat("print-function-statistics", 107 cl::desc("print statistics about basic block ordering"), 108 cl::init(0), cl::cat(BoltOptCategory)); 109 110 static cl::list<bolt::DynoStats::Category> 111 PrintSortedBy("print-sorted-by", cl::CommaSeparated, 112 cl::desc("print functions sorted by order of dyno stats"), 113 cl::value_desc("key1,key2,key3,..."), 114 cl::values( 115 #define D(name, description, ...) \ 116 clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name), \ 117 description), 118 REAL_DYNO_STATS 119 #undef D 120 clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all", 121 "sorted by all names")), 122 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 123 124 static cl::opt<bool> 125 PrintUnknown("print-unknown", 126 cl::desc("print names of functions with unknown control flow"), 127 cl::cat(BoltCategory), cl::Hidden); 128 129 static cl::opt<bool> 130 PrintUnknownCFG("print-unknown-cfg", 131 cl::desc("dump CFG of functions with unknown control flow"), 132 cl::cat(BoltCategory), cl::ReallyHidden); 133 134 // Please MSVC19 with a forward declaration: otherwise it reports an error about 135 // an undeclared variable inside a callback. 136 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks; 137 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks( 138 "reorder-blocks", cl::desc("change layout of basic blocks in a function"), 139 cl::init(bolt::ReorderBasicBlocks::LT_NONE), 140 cl::values( 141 clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none", 142 "do not reorder basic blocks"), 143 clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse", 144 "layout blocks in reverse order"), 145 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal", 146 "perform optimal layout based on profile"), 147 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH, 148 "branch-predictor", 149 "perform optimal layout prioritizing branch " 150 "predictions"), 151 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache", 152 "perform optimal layout prioritizing I-cache " 153 "behavior"), 154 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+", 155 "perform layout optimizing I-cache behavior"), 156 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp", 157 "perform layout optimizing I-cache behavior"), 158 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, 159 "cluster-shuffle", "perform random layout of clusters")), 160 cl::ZeroOrMore, cl::cat(BoltOptCategory), 161 cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) { 162 if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) { 163 errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please" 164 << " use '-reorder-blocks=ext-tsp' instead\n"; 165 ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP; 166 } 167 })); 168 169 static cl::opt<unsigned> ReportBadLayout( 170 "report-bad-layout", 171 cl::desc("print top <uint> functions with suboptimal code layout on input"), 172 cl::init(0), cl::Hidden, cl::cat(BoltOptCategory)); 173 174 static cl::opt<bool> 175 ReportStaleFuncs("report-stale", 176 cl::desc("print the list of functions with stale profile"), 177 cl::Hidden, cl::cat(BoltOptCategory)); 178 179 enum SctcModes : char { 180 SctcAlways, 181 SctcPreserveDirection, 182 SctcHeuristic 183 }; 184 185 static cl::opt<SctcModes> 186 SctcMode("sctc-mode", 187 cl::desc("mode for simplify conditional tail calls"), 188 cl::init(SctcAlways), 189 cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), 190 clEnumValN(SctcPreserveDirection, 191 "preserve", 192 "only perform sctc when branch direction is " 193 "preserved"), 194 clEnumValN(SctcHeuristic, 195 "heuristic", 196 "use branch prediction data to control sctc")), 197 cl::ZeroOrMore, 198 cl::cat(BoltOptCategory)); 199 200 static cl::opt<unsigned> 201 StaleThreshold("stale-threshold", 202 cl::desc( 203 "maximum percentage of stale functions to tolerate (default: 100)"), 204 cl::init(100), 205 cl::Hidden, 206 cl::cat(BoltOptCategory)); 207 208 static cl::opt<unsigned> TSPThreshold( 209 "tsp-threshold", 210 cl::desc( 211 "maximum number of hot basic blocks in a function for which to use " 212 "a precise TSP solution while re-ordering basic blocks"), 213 cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); 214 215 static cl::opt<unsigned> TopCalledLimit( 216 "top-called-limit", 217 cl::desc("maximum number of functions to print in top called " 218 "functions section"), 219 cl::init(100), cl::Hidden, cl::cat(BoltCategory)); 220 221 } // namespace opts 222 223 namespace llvm { 224 namespace bolt { 225 226 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { 227 return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG && 228 !BF.isIgnored(); 229 } 230 231 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { 232 return BF.isSimple() && !BF.isIgnored(); 233 } 234 235 void NormalizeCFG::runOnFunction(BinaryFunction &BF) { 236 uint64_t NumRemoved = 0; 237 uint64_t NumDuplicateEdges = 0; 238 uint64_t NeedsFixBranches = 0; 239 for (BinaryBasicBlock &BB : BF) { 240 if (!BB.empty()) 241 continue; 242 243 if (BB.isEntryPoint() || BB.isLandingPad()) 244 continue; 245 246 // Handle a dangling empty block. 247 if (BB.succ_size() == 0) { 248 // If an empty dangling basic block has a predecessor, it could be a 249 // result of codegen for __builtin_unreachable. In such case, do not 250 // remove the block. 251 if (BB.pred_size() == 0) { 252 BB.markValid(false); 253 ++NumRemoved; 254 } 255 continue; 256 } 257 258 // The block should have just one successor. 259 BinaryBasicBlock *Successor = BB.getSuccessor(); 260 assert(Successor && "invalid CFG encountered"); 261 262 // Redirect all predecessors to the successor block. 263 while (!BB.pred_empty()) { 264 BinaryBasicBlock *Predecessor = *BB.pred_begin(); 265 if (Predecessor->hasJumpTable()) 266 break; 267 268 if (Predecessor == Successor) 269 break; 270 271 BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB); 272 Predecessor->replaceSuccessor(&BB, Successor, BI.Count, 273 BI.MispredictedCount); 274 // We need to fix branches even if we failed to replace all successors 275 // and remove the block. 276 NeedsFixBranches = true; 277 } 278 279 if (BB.pred_empty()) { 280 BB.removeAllSuccessors(); 281 BB.markValid(false); 282 ++NumRemoved; 283 } 284 } 285 286 if (NumRemoved) 287 BF.eraseInvalidBBs(); 288 289 // Check for duplicate successors. Do it after the empty block elimination as 290 // we can get more duplicate successors. 291 for (BinaryBasicBlock &BB : BF) 292 if (!BB.hasJumpTable() && BB.succ_size() == 2 && 293 BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true)) 294 ++NumDuplicateEdges; 295 296 // fixBranches() will get rid of duplicate edges and update jump instructions. 297 if (NumDuplicateEdges || NeedsFixBranches) 298 BF.fixBranches(); 299 300 NumDuplicateEdgesMerged += NumDuplicateEdges; 301 NumBlocksRemoved += NumRemoved; 302 } 303 304 void NormalizeCFG::runOnFunctions(BinaryContext &BC) { 305 ParallelUtilities::runOnEachFunction( 306 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, 307 [&](BinaryFunction &BF) { runOnFunction(BF); }, 308 [&](const BinaryFunction &BF) { return !shouldOptimize(BF); }, 309 "NormalizeCFG"); 310 if (NumBlocksRemoved) 311 outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block" 312 << (NumBlocksRemoved == 1 ? "" : "s") << '\n'; 313 if (NumDuplicateEdgesMerged) 314 outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged 315 << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s") 316 << '\n'; 317 } 318 319 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { 320 if (!Function.getLayout().block_empty()) { 321 unsigned Count; 322 uint64_t Bytes; 323 Function.markUnreachableBlocks(); 324 LLVM_DEBUG({ 325 for (BinaryBasicBlock &BB : Function) { 326 if (!BB.isValid()) { 327 dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() 328 << " in function " << Function << "\n"; 329 Function.dump(); 330 } 331 } 332 }); 333 std::tie(Count, Bytes) = Function.eraseInvalidBBs(); 334 DeletedBlocks += Count; 335 DeletedBytes += Bytes; 336 if (Count) { 337 Modified.insert(&Function); 338 if (opts::Verbosity > 0) 339 outs() << "BOLT-INFO: removed " << Count 340 << " dead basic block(s) accounting for " << Bytes 341 << " bytes in function " << Function << '\n'; 342 } 343 } 344 } 345 346 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { 347 for (auto &It : BC.getBinaryFunctions()) { 348 BinaryFunction &Function = It.second; 349 if (shouldOptimize(Function)) 350 runOnFunction(Function); 351 } 352 353 if (DeletedBlocks) 354 outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " 355 << DeletedBytes << " bytes of code\n"; 356 } 357 358 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { 359 return (BinaryFunctionPass::shouldPrint(BF) && 360 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); 361 } 362 363 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { 364 // Apply execution count threshold 365 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) 366 return false; 367 368 return BinaryFunctionPass::shouldOptimize(BF); 369 } 370 371 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { 372 if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) 373 return; 374 375 std::atomic_uint64_t ModifiedFuncCount(0); 376 std::mutex FunctionEditDistanceMutex; 377 DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance; 378 379 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 380 SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder; 381 if (opts::PrintFuncStat > 0) 382 llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder)); 383 384 const bool LayoutChanged = 385 modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters); 386 if (LayoutChanged) { 387 ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed); 388 if (opts::PrintFuncStat > 0) { 389 const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder); 390 std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex); 391 FunctionEditDistance[&BF] = Distance; 392 } 393 } 394 }; 395 396 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 397 return !shouldOptimize(BF); 398 }; 399 400 ParallelUtilities::runOnEachFunction( 401 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc, 402 "ReorderBasicBlocks"); 403 const size_t NumAllProfiledFunctions = 404 BC.NumProfiledFuncs + BC.NumStaleProfileFuncs; 405 406 outs() << "BOLT-INFO: basic block reordering modified layout of " 407 << format("%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n", 408 ModifiedFuncCount.load(std::memory_order_relaxed), 409 100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) / 410 NumAllProfiledFunctions, 411 100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) / 412 BC.getBinaryFunctions().size()); 413 414 if (opts::PrintFuncStat > 0) { 415 raw_ostream &OS = outs(); 416 // Copy all the values into vector in order to sort them 417 std::map<uint64_t, BinaryFunction &> ScoreMap; 418 auto &BFs = BC.getBinaryFunctions(); 419 for (auto It = BFs.begin(); It != BFs.end(); ++It) 420 ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>( 421 It->second.getFunctionScore(), It->second)); 422 423 OS << "\nBOLT-INFO: Printing Function Statistics:\n\n"; 424 OS << " There are " << BFs.size() << " functions in total. \n"; 425 OS << " Number of functions being modified: " 426 << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n"; 427 OS << " User asks for detailed information on top " 428 << opts::PrintFuncStat << " functions. (Ranked by function score)" 429 << "\n\n"; 430 uint64_t I = 0; 431 for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit = 432 ScoreMap.rbegin(); 433 Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { 434 BinaryFunction &Function = Rit->second; 435 436 OS << " Information for function of top: " << (I + 1) << ": \n"; 437 OS << " Function Score is: " << Function.getFunctionScore() 438 << "\n"; 439 OS << " There are " << Function.size() 440 << " number of blocks in this function.\n"; 441 OS << " There are " << Function.getInstructionCount() 442 << " number of instructions in this function.\n"; 443 OS << " The edit distance for this function is: " 444 << FunctionEditDistance.lookup(&Function) << "\n\n"; 445 } 446 } 447 } 448 449 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, 450 LayoutType Type, 451 bool MinBranchClusters) const { 452 if (BF.size() == 0 || Type == LT_NONE) 453 return false; 454 455 BinaryFunction::BasicBlockOrderType NewLayout; 456 std::unique_ptr<ReorderAlgorithm> Algo; 457 458 // Cannot do optimal layout without profile. 459 if (Type != LT_REVERSE && !BF.hasValidProfile()) 460 return false; 461 462 if (Type == LT_REVERSE) { 463 Algo.reset(new ReverseReorderAlgorithm()); 464 } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) { 465 // Work on optimal solution if problem is small enough 466 LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); 467 Algo.reset(new TSPReorderAlgorithm()); 468 } else { 469 LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n"); 470 471 std::unique_ptr<ClusterAlgorithm> CAlgo; 472 if (MinBranchClusters) 473 CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); 474 else 475 CAlgo.reset(new PHGreedyClusterAlgorithm()); 476 477 switch (Type) { 478 case LT_OPTIMIZE: 479 Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); 480 break; 481 482 case LT_OPTIMIZE_BRANCH: 483 Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); 484 break; 485 486 case LT_OPTIMIZE_CACHE: 487 Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); 488 break; 489 490 case LT_OPTIMIZE_EXT_TSP: 491 Algo.reset(new ExtTSPReorderAlgorithm()); 492 break; 493 494 case LT_OPTIMIZE_SHUFFLE: 495 Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); 496 break; 497 498 default: 499 llvm_unreachable("unexpected layout type"); 500 } 501 } 502 503 Algo->reorderBasicBlocks(BF, NewLayout); 504 505 return BF.getLayout().update(NewLayout); 506 } 507 508 void FixupBranches::runOnFunctions(BinaryContext &BC) { 509 for (auto &It : BC.getBinaryFunctions()) { 510 BinaryFunction &Function = It.second; 511 if (!BC.shouldEmit(Function) || !Function.isSimple()) 512 continue; 513 514 Function.fixBranches(); 515 } 516 } 517 518 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) { 519 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 520 if (!BF.finalizeCFIState()) { 521 if (BC.HasRelocations) { 522 errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF 523 << ". Exiting.\n"; 524 exit(1); 525 } 526 BF.setSimple(false); 527 return; 528 } 529 530 BF.setFinalized(); 531 532 // Update exception handling information. 533 BF.updateEHRanges(); 534 }; 535 536 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 537 return !BC.shouldEmit(BF); 538 }; 539 540 ParallelUtilities::runOnEachFunction( 541 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, 542 SkipPredicate, "FinalizeFunctions"); 543 } 544 545 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { 546 if (BC.HasRelocations) 547 return; 548 549 if (!opts::UpdateDebugSections) 550 return; 551 552 // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit 553 // incorrect debug info. 554 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 555 uint64_t HotSize, ColdSize; 556 std::tie(HotSize, ColdSize) = 557 BC.calculateEmittedSize(BF, /*FixBranches=*/false); 558 if (HotSize > BF.getMaxSize()) 559 BF.setSimple(false); 560 }; 561 562 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 563 return !shouldOptimize(BF); 564 }; 565 566 ParallelUtilities::runOnEachFunction( 567 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 568 SkipFunc, "CheckLargeFunctions"); 569 } 570 571 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { 572 // Unlike other passes, allow functions in non-CFG state. 573 return BF.isSimple() && !BF.isIgnored(); 574 } 575 576 void LowerAnnotations::runOnFunctions(BinaryContext &BC) { 577 std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations; 578 std::vector<std::pair<MCInst *, MCSymbol *>> PreservedLabelAnnotations; 579 580 for (auto &It : BC.getBinaryFunctions()) { 581 BinaryFunction &BF = It.second; 582 583 for (FunctionFragment &FF : BF.getLayout().fragments()) { 584 int64_t CurrentGnuArgsSize = 0; 585 586 for (BinaryBasicBlock *const BB : FF) { 587 // First convert GnuArgsSize annotations into CFIs. This may change 588 // instr pointers, so do it before recording ptrs for preserved 589 // annotations 590 if (BF.usesGnuArgsSize()) { 591 for (auto II = BB->begin(); II != BB->end(); ++II) { 592 if (!BC.MIB->isInvoke(*II)) 593 continue; 594 const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); 595 assert(NewGnuArgsSize >= 0 && 596 "expected non-negative GNU_args_size"); 597 if (NewGnuArgsSize != CurrentGnuArgsSize) { 598 auto InsertII = BF.addCFIInstruction( 599 BB, II, 600 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); 601 CurrentGnuArgsSize = NewGnuArgsSize; 602 II = std::next(InsertII); 603 } 604 } 605 } 606 607 // Now record preserved annotations separately and then strip 608 // annotations. 609 for (auto II = BB->begin(); II != BB->end(); ++II) { 610 if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II)) 611 PreservedOffsetAnnotations.emplace_back(&(*II), 612 *BC.MIB->getOffset(*II)); 613 if (MCSymbol *Label = BC.MIB->getLabel(*II)) 614 PreservedLabelAnnotations.emplace_back(&*II, Label); 615 BC.MIB->stripAnnotations(*II); 616 } 617 } 618 } 619 } 620 for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) 621 for (BinaryBasicBlock &BB : *BF) 622 for (MCInst &Instruction : BB) { 623 if (MCSymbol *Label = BC.MIB->getLabel(Instruction)) 624 PreservedLabelAnnotations.emplace_back(&Instruction, Label); 625 BC.MIB->stripAnnotations(Instruction); 626 } 627 628 // Release all memory taken by annotations 629 BC.MIB->freeAnnotations(); 630 631 // Reinsert preserved annotations we need during code emission. 632 for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations) 633 BC.MIB->setOffset(*Item.first, Item.second); 634 for (auto [Instr, Label] : PreservedLabelAnnotations) 635 BC.MIB->setLabel(*Instr, Label); 636 } 637 638 // Check for dirty state in MCSymbol objects that might be a consequence 639 // of running calculateEmittedSize() in parallel, during split functions 640 // pass. If an inconsistent state is found (symbol already registered or 641 // already defined), clean it. 642 void CleanMCState::runOnFunctions(BinaryContext &BC) { 643 MCContext &Ctx = *BC.Ctx; 644 for (const auto &SymMapEntry : Ctx.getSymbols()) { 645 const MCSymbol *S = SymMapEntry.second; 646 if (S->isDefined()) { 647 LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() 648 << "\" is already defined\n"); 649 const_cast<MCSymbol *>(S)->setUndefined(); 650 } 651 if (S->isRegistered()) { 652 LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() 653 << "\" is already registered\n"); 654 const_cast<MCSymbol *>(S)->setIsRegistered(false); 655 } 656 LLVM_DEBUG(if (S->isVariable()) { 657 dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n"; 658 }); 659 } 660 } 661 662 // This peephole fixes jump instructions that jump to another basic 663 // block with a single jump instruction, e.g. 664 // 665 // B0: ... 666 // jmp B1 (or jcc B1) 667 // 668 // B1: jmp B2 669 // 670 // -> 671 // 672 // B0: ... 673 // jmp B2 (or jcc B2) 674 // 675 static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) { 676 uint64_t NumDoubleJumps = 0; 677 678 MCContext *Ctx = Function.getBinaryContext().Ctx.get(); 679 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 680 for (BinaryBasicBlock &BB : Function) { 681 auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ, 682 const MCSymbol *SuccSym) { 683 // Ignore infinite loop jumps or fallthrough tail jumps. 684 if (Pred == Succ || Succ == &BB) 685 return false; 686 687 if (Succ) { 688 const MCSymbol *TBB = nullptr; 689 const MCSymbol *FBB = nullptr; 690 MCInst *CondBranch = nullptr; 691 MCInst *UncondBranch = nullptr; 692 bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 693 if (!Res) { 694 LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; 695 Pred->dump()); 696 return false; 697 } 698 Pred->replaceSuccessor(&BB, Succ); 699 700 // We must patch up any existing branch instructions to match up 701 // with the new successor. 702 assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) && 703 "Predecessor block has inconsistent number of successors"); 704 if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) { 705 MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); 706 } else if (UncondBranch && 707 MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) { 708 MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); 709 } else if (!UncondBranch) { 710 assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ && 711 "Don't add an explicit jump to a fallthrough block."); 712 Pred->addBranchInstruction(Succ); 713 } 714 } else { 715 // Succ will be null in the tail call case. In this case we 716 // need to explicitly add a tail call instruction. 717 MCInst *Branch = Pred->getLastNonPseudoInstr(); 718 if (Branch && MIB->isUnconditionalBranch(*Branch)) { 719 assert(MIB->getTargetSymbol(*Branch) == BB.getLabel()); 720 Pred->removeSuccessor(&BB); 721 Pred->eraseInstruction(Pred->findInstruction(Branch)); 722 Pred->addTailCallInstruction(SuccSym); 723 } else { 724 return false; 725 } 726 } 727 728 ++NumDoubleJumps; 729 LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from " 730 << Pred->getName() << " -> " << BB.getName() << " to " 731 << Pred->getName() << " -> " << SuccSym->getName() 732 << (!Succ ? " (tail)\n" : "\n")); 733 734 return true; 735 }; 736 737 if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) 738 continue; 739 740 MCInst *Inst = BB.getFirstNonPseudoInstr(); 741 const bool IsTailCall = MIB->isTailCall(*Inst); 742 743 if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall) 744 continue; 745 746 // If we operate after SCTC make sure it's not a conditional tail call. 747 if (IsTailCall && MIB->isConditionalBranch(*Inst)) 748 continue; 749 750 const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst); 751 BinaryBasicBlock *Succ = BB.getSuccessor(); 752 753 if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym)) 754 continue; 755 756 std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()}; 757 758 for (BinaryBasicBlock *Pred : Preds) { 759 if (Pred->isLandingPad()) 760 continue; 761 762 if (Pred->getSuccessor() == &BB || 763 (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || 764 Pred->getConditionalSuccessor(false) == &BB) 765 if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid) 766 BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() || 767 BB.isEntryPoint()); 768 } 769 } 770 771 return NumDoubleJumps; 772 } 773 774 bool SimplifyConditionalTailCalls::shouldRewriteBranch( 775 const BinaryBasicBlock *PredBB, const MCInst &CondBranch, 776 const BinaryBasicBlock *BB, const bool DirectionFlag) { 777 if (BeenOptimized.count(PredBB)) 778 return false; 779 780 const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); 781 782 if (IsForward) 783 ++NumOrigForwardBranches; 784 else 785 ++NumOrigBackwardBranches; 786 787 if (opts::SctcMode == opts::SctcAlways) 788 return true; 789 790 if (opts::SctcMode == opts::SctcPreserveDirection) 791 return IsForward == DirectionFlag; 792 793 const ErrorOr<std::pair<double, double>> Frequency = 794 PredBB->getBranchStats(BB); 795 796 // It's ok to rewrite the conditional branch if the new target will be 797 // a backward branch. 798 799 // If no data available for these branches, then it should be ok to 800 // do the optimization since it will reduce code size. 801 if (Frequency.getError()) 802 return true; 803 804 // TODO: should this use misprediction frequency instead? 805 const bool Result = (IsForward && Frequency.get().first >= 0.5) || 806 (!IsForward && Frequency.get().first <= 0.5); 807 808 return Result == DirectionFlag; 809 } 810 811 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) { 812 // Need updated indices to correctly detect branch' direction. 813 BF.getLayout().updateLayoutIndices(); 814 BF.markUnreachableBlocks(); 815 816 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 817 MCContext *Ctx = BF.getBinaryContext().Ctx.get(); 818 uint64_t NumLocalCTCCandidates = 0; 819 uint64_t NumLocalCTCs = 0; 820 uint64_t LocalCTCTakenCount = 0; 821 uint64_t LocalCTCExecCount = 0; 822 std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>> 823 NeedsUncondBranch; 824 825 // Will block be deleted by UCE? 826 auto isValid = [](const BinaryBasicBlock *BB) { 827 return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint()); 828 }; 829 830 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { 831 // Locate BB with a single direct tail-call instruction. 832 if (BB->getNumNonPseudos() != 1) 833 continue; 834 835 MCInst *Instr = BB->getFirstNonPseudoInstr(); 836 if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr)) 837 continue; 838 839 const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr); 840 if (!CalleeSymbol) 841 continue; 842 843 // Detect direction of the possible conditional tail call. 844 const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol); 845 846 // Iterate through all predecessors. 847 for (BinaryBasicBlock *PredBB : BB->predecessors()) { 848 BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true); 849 if (!CondSucc) 850 continue; 851 852 ++NumLocalCTCCandidates; 853 854 const MCSymbol *TBB = nullptr; 855 const MCSymbol *FBB = nullptr; 856 MCInst *CondBranch = nullptr; 857 MCInst *UncondBranch = nullptr; 858 bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 859 860 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz 861 if (!Result) { 862 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n"; 863 PredBB->dump()); 864 continue; 865 } 866 867 assert(Result && "internal error analyzing conditional branch"); 868 assert(CondBranch && "conditional branch expected"); 869 870 // It's possible that PredBB is also a successor to BB that may have 871 // been processed by a previous iteration of the SCTC loop, in which 872 // case it may have been marked invalid. We should skip rewriting in 873 // this case. 874 if (!PredBB->isValid()) { 875 assert(PredBB->isSuccessor(BB) && 876 "PredBB should be valid if it is not a successor to BB"); 877 continue; 878 } 879 880 // We don't want to reverse direction of the branch in new order 881 // without further profile analysis. 882 const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC; 883 if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag)) 884 continue; 885 886 // Record this block so that we don't try to optimize it twice. 887 BeenOptimized.insert(PredBB); 888 889 uint64_t Count = 0; 890 if (CondSucc != BB) { 891 // Patch the new target address into the conditional branch. 892 MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx); 893 // Since we reversed the condition on the branch we need to change 894 // the target for the unconditional branch or add a unconditional 895 // branch to the old target. This has to be done manually since 896 // fixupBranches is not called after SCTC. 897 NeedsUncondBranch.emplace_back(PredBB, CondSucc); 898 Count = PredBB->getFallthroughBranchInfo().Count; 899 } else { 900 // Change destination of the conditional branch. 901 MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx); 902 Count = PredBB->getTakenBranchInfo().Count; 903 } 904 const uint64_t CTCTakenFreq = 905 Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; 906 907 // Annotate it, so "isCall" returns true for this jcc 908 MIB->setConditionalTailCall(*CondBranch); 909 // Add info about the conditional tail call frequency, otherwise this 910 // info will be lost when we delete the associated BranchInfo entry 911 auto &CTCAnnotation = 912 MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount"); 913 CTCAnnotation = CTCTakenFreq; 914 915 // Remove the unused successor which may be eliminated later 916 // if there are no other users. 917 PredBB->removeSuccessor(BB); 918 // Update BB execution count 919 if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount()) 920 BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); 921 else if (CTCTakenFreq > BB->getKnownExecutionCount()) 922 BB->setExecutionCount(0); 923 924 ++NumLocalCTCs; 925 LocalCTCTakenCount += CTCTakenFreq; 926 LocalCTCExecCount += PredBB->getKnownExecutionCount(); 927 } 928 929 // Remove the block from CFG if all predecessors were removed. 930 BB->markValid(isValid(BB)); 931 } 932 933 // Add unconditional branches at the end of BBs to new successors 934 // as long as the successor is not a fallthrough. 935 for (auto &Entry : NeedsUncondBranch) { 936 BinaryBasicBlock *PredBB = Entry.first; 937 const BinaryBasicBlock *CondSucc = Entry.second; 938 939 const MCSymbol *TBB = nullptr; 940 const MCSymbol *FBB = nullptr; 941 MCInst *CondBranch = nullptr; 942 MCInst *UncondBranch = nullptr; 943 PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 944 945 // Find the next valid block. Invalid blocks will be deleted 946 // so they shouldn't be considered fallthrough targets. 947 const BinaryBasicBlock *NextBlock = 948 BF.getLayout().getBasicBlockAfter(PredBB, false); 949 while (NextBlock && !isValid(NextBlock)) 950 NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false); 951 952 // Get the unconditional successor to this block. 953 const BinaryBasicBlock *PredSucc = PredBB->getSuccessor(); 954 assert(PredSucc && "The other branch should be a tail call"); 955 956 const bool HasFallthrough = (NextBlock && PredSucc == NextBlock); 957 958 if (UncondBranch) { 959 if (HasFallthrough) 960 PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch)); 961 else 962 MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx); 963 } else if (!HasFallthrough) { 964 MCInst Branch; 965 MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx); 966 PredBB->addInstruction(Branch); 967 } 968 } 969 970 if (NumLocalCTCs > 0) { 971 NumDoubleJumps += fixDoubleJumps(BF, true); 972 // Clean-up unreachable tail-call blocks. 973 const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs(); 974 DeletedBlocks += Stats.first; 975 DeletedBytes += Stats.second; 976 977 assert(BF.validateCFG()); 978 } 979 980 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs 981 << " conditional tail calls from a total of " 982 << NumLocalCTCCandidates << " candidates in function " << BF 983 << ". CTCs execution count for this function is " 984 << LocalCTCExecCount << " and CTC taken count is " 985 << LocalCTCTakenCount << "\n";); 986 987 NumTailCallsPatched += NumLocalCTCs; 988 NumCandidateTailCalls += NumLocalCTCCandidates; 989 CTCExecCount += LocalCTCExecCount; 990 CTCTakenCount += LocalCTCTakenCount; 991 992 return NumLocalCTCs > 0; 993 } 994 995 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { 996 if (!BC.isX86()) 997 return; 998 999 for (auto &It : BC.getBinaryFunctions()) { 1000 BinaryFunction &Function = It.second; 1001 1002 if (!shouldOptimize(Function)) 1003 continue; 1004 1005 if (fixTailCalls(Function)) { 1006 Modified.insert(&Function); 1007 Function.setHasCanonicalCFG(false); 1008 } 1009 } 1010 1011 if (NumTailCallsPatched) 1012 outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched 1013 << " tail calls (" << NumOrigForwardBranches << " forward)" 1014 << " tail calls (" << NumOrigBackwardBranches << " backward)" 1015 << " from a total of " << NumCandidateTailCalls << " while removing " 1016 << NumDoubleJumps << " double jumps" 1017 << " and removing " << DeletedBlocks << " basic blocks" 1018 << " totalling " << DeletedBytes 1019 << " bytes of code. CTCs total execution count is " << CTCExecCount 1020 << " and the number of times CTCs are taken is " << CTCTakenCount 1021 << "\n"; 1022 } 1023 1024 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) { 1025 uint64_t Count = 0; 1026 const BinaryContext &BC = Function.getBinaryContext(); 1027 for (BinaryBasicBlock &BB : Function) { 1028 for (MCInst &Inst : BB) { 1029 MCInst OriginalInst; 1030 if (opts::Verbosity > 2) 1031 OriginalInst = Inst; 1032 1033 if (!BC.MIB->shortenInstruction(Inst, *BC.STI)) 1034 continue; 1035 1036 if (opts::Verbosity > 2) { 1037 BC.scopeLock(); 1038 outs() << "BOLT-INFO: shortening:\nBOLT-INFO: "; 1039 BC.printInstruction(outs(), OriginalInst, 0, &Function); 1040 outs() << "BOLT-INFO: to:"; 1041 BC.printInstruction(outs(), Inst, 0, &Function); 1042 } 1043 1044 ++Count; 1045 } 1046 } 1047 1048 return Count; 1049 } 1050 1051 void ShortenInstructions::runOnFunctions(BinaryContext &BC) { 1052 std::atomic<uint64_t> NumShortened{0}; 1053 if (!BC.isX86()) 1054 return; 1055 1056 ParallelUtilities::runOnEachFunction( 1057 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, 1058 [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); }, 1059 nullptr, "ShortenInstructions"); 1060 1061 if (NumShortened) 1062 outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n"; 1063 } 1064 1065 void Peepholes::addTailcallTraps(BinaryFunction &Function) { 1066 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 1067 for (BinaryBasicBlock &BB : Function) { 1068 MCInst *Inst = BB.getLastNonPseudoInstr(); 1069 if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) { 1070 MCInst Trap; 1071 if (MIB->createTrap(Trap)) { 1072 BB.addInstruction(Trap); 1073 ++TailCallTraps; 1074 } 1075 } 1076 } 1077 } 1078 1079 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { 1080 for (BinaryBasicBlock &BB : Function) { 1081 if (BB.succ_size() != 2) 1082 continue; 1083 1084 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true); 1085 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false); 1086 if (CondBB != UncondBB) 1087 continue; 1088 1089 const MCSymbol *TBB = nullptr; 1090 const MCSymbol *FBB = nullptr; 1091 MCInst *CondBranch = nullptr; 1092 MCInst *UncondBranch = nullptr; 1093 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 1094 1095 // analyzeBranch() can fail due to unusual branch instructions, 1096 // e.g. jrcxz, or jump tables (indirect jump). 1097 if (!Result || !CondBranch) 1098 continue; 1099 1100 BB.removeDuplicateConditionalSuccessor(CondBranch); 1101 ++NumUselessCondBranches; 1102 } 1103 } 1104 1105 void Peepholes::runOnFunctions(BinaryContext &BC) { 1106 const char Opts = 1107 std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0, 1108 [](const char A, const PeepholeOpts B) { return A | B; }); 1109 if (Opts == PEEP_NONE) 1110 return; 1111 1112 for (auto &It : BC.getBinaryFunctions()) { 1113 BinaryFunction &Function = It.second; 1114 if (shouldOptimize(Function)) { 1115 if (Opts & PEEP_DOUBLE_JUMPS) 1116 NumDoubleJumps += fixDoubleJumps(Function, false); 1117 if (Opts & PEEP_TAILCALL_TRAPS) 1118 addTailcallTraps(Function); 1119 if (Opts & PEEP_USELESS_BRANCHES) 1120 removeUselessCondBranches(Function); 1121 assert(Function.validateCFG()); 1122 } 1123 } 1124 outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps 1125 << " double jumps patched.\n" 1126 << "BOLT-INFO: Peephole: " << TailCallTraps 1127 << " tail call traps inserted.\n" 1128 << "BOLT-INFO: Peephole: " << NumUselessCondBranches 1129 << " useless conditional branches removed.\n"; 1130 } 1131 1132 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) { 1133 BinaryContext &BC = BF.getBinaryContext(); 1134 MCPlusBuilder *MIB = BC.MIB.get(); 1135 1136 uint64_t NumLocalLoadsSimplified = 0; 1137 uint64_t NumDynamicLocalLoadsSimplified = 0; 1138 uint64_t NumLocalLoadsFound = 0; 1139 uint64_t NumDynamicLocalLoadsFound = 0; 1140 1141 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { 1142 for (MCInst &Inst : *BB) { 1143 unsigned Opcode = Inst.getOpcode(); 1144 const MCInstrDesc &Desc = BC.MII->get(Opcode); 1145 1146 // Skip instructions that do not load from memory. 1147 if (!Desc.mayLoad()) 1148 continue; 1149 1150 // Try to statically evaluate the target memory address; 1151 uint64_t TargetAddress; 1152 1153 if (MIB->hasPCRelOperand(Inst)) { 1154 // Try to find the symbol that corresponds to the PC-relative operand. 1155 MCOperand *DispOpI = MIB->getMemOperandDisp(Inst); 1156 assert(DispOpI != Inst.end() && "expected PC-relative displacement"); 1157 assert(DispOpI->isExpr() && 1158 "found PC-relative with non-symbolic displacement"); 1159 1160 // Get displacement symbol. 1161 const MCSymbol *DisplSymbol; 1162 uint64_t DisplOffset; 1163 1164 std::tie(DisplSymbol, DisplOffset) = 1165 MIB->getTargetSymbolInfo(DispOpI->getExpr()); 1166 1167 if (!DisplSymbol) 1168 continue; 1169 1170 // Look up the symbol address in the global symbols map of the binary 1171 // context object. 1172 BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName()); 1173 if (!BD) 1174 continue; 1175 TargetAddress = BD->getAddress() + DisplOffset; 1176 } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) { 1177 continue; 1178 } 1179 1180 // Get the contents of the section containing the target address of the 1181 // memory operand. We are only interested in read-only sections. 1182 ErrorOr<BinarySection &> DataSection = 1183 BC.getSectionForAddress(TargetAddress); 1184 if (!DataSection || DataSection->isWritable()) 1185 continue; 1186 1187 if (BC.getRelocationAt(TargetAddress) || 1188 BC.getDynamicRelocationAt(TargetAddress)) 1189 continue; 1190 1191 uint32_t Offset = TargetAddress - DataSection->getAddress(); 1192 StringRef ConstantData = DataSection->getContents(); 1193 1194 ++NumLocalLoadsFound; 1195 if (BB->hasProfile()) 1196 NumDynamicLocalLoadsFound += BB->getExecutionCount(); 1197 1198 if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { 1199 ++NumLocalLoadsSimplified; 1200 if (BB->hasProfile()) 1201 NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); 1202 } 1203 } 1204 } 1205 1206 NumLoadsFound += NumLocalLoadsFound; 1207 NumDynamicLoadsFound += NumDynamicLocalLoadsFound; 1208 NumLoadsSimplified += NumLocalLoadsSimplified; 1209 NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; 1210 1211 return NumLocalLoadsSimplified > 0; 1212 } 1213 1214 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { 1215 for (auto &It : BC.getBinaryFunctions()) { 1216 BinaryFunction &Function = It.second; 1217 if (shouldOptimize(Function) && simplifyRODataLoads(Function)) 1218 Modified.insert(&Function); 1219 } 1220 1221 outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " 1222 << NumLoadsFound << " loads from a statically computed address.\n" 1223 << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified 1224 << "\n" 1225 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; 1226 } 1227 1228 void AssignSections::runOnFunctions(BinaryContext &BC) { 1229 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { 1230 Function->setCodeSectionName(BC.getInjectedCodeSectionName()); 1231 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); 1232 } 1233 1234 // In non-relocation mode functions have pre-assigned section names. 1235 if (!BC.HasRelocations) 1236 return; 1237 1238 const bool UseColdSection = 1239 BC.NumProfiledFuncs > 0 || 1240 opts::ReorderFunctions == ReorderFunctions::RT_USER; 1241 for (auto &BFI : BC.getBinaryFunctions()) { 1242 BinaryFunction &Function = BFI.second; 1243 if (opts::isHotTextMover(Function)) { 1244 Function.setCodeSectionName(BC.getHotTextMoverSectionName()); 1245 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); 1246 continue; 1247 } 1248 1249 if (!UseColdSection || Function.hasValidIndex()) 1250 Function.setCodeSectionName(BC.getMainCodeSectionName()); 1251 else 1252 Function.setCodeSectionName(BC.getColdCodeSectionName()); 1253 1254 if (Function.isSplit()) 1255 Function.setColdCodeSectionName(BC.getColdCodeSectionName()); 1256 } 1257 } 1258 1259 void PrintProfileStats::runOnFunctions(BinaryContext &BC) { 1260 double FlowImbalanceMean = 0.0; 1261 size_t NumBlocksConsidered = 0; 1262 double WorstBias = 0.0; 1263 const BinaryFunction *WorstBiasFunc = nullptr; 1264 1265 // For each function CFG, we fill an IncomingMap with the sum of the frequency 1266 // of incoming edges for each BB. Likewise for each OutgoingMap and the sum 1267 // of the frequency of outgoing edges. 1268 using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>; 1269 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps; 1270 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps; 1271 1272 // Compute mean 1273 for (const auto &BFI : BC.getBinaryFunctions()) { 1274 const BinaryFunction &Function = BFI.second; 1275 if (Function.empty() || !Function.isSimple()) 1276 continue; 1277 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1278 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1279 for (const BinaryBasicBlock &BB : Function) { 1280 uint64_t TotalOutgoing = 0ULL; 1281 auto SuccBIIter = BB.branch_info_begin(); 1282 for (BinaryBasicBlock *Succ : BB.successors()) { 1283 uint64_t Count = SuccBIIter->Count; 1284 if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { 1285 ++SuccBIIter; 1286 continue; 1287 } 1288 TotalOutgoing += Count; 1289 IncomingMap[Succ] += Count; 1290 ++SuccBIIter; 1291 } 1292 OutgoingMap[&BB] = TotalOutgoing; 1293 } 1294 1295 size_t NumBlocks = 0; 1296 double Mean = 0.0; 1297 for (const BinaryBasicBlock &BB : Function) { 1298 // Do not compute score for low frequency blocks, entry or exit blocks 1299 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint()) 1300 continue; 1301 ++NumBlocks; 1302 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1303 Mean += fabs(Difference / IncomingMap[&BB]); 1304 } 1305 1306 FlowImbalanceMean += Mean; 1307 NumBlocksConsidered += NumBlocks; 1308 if (!NumBlocks) 1309 continue; 1310 double FuncMean = Mean / NumBlocks; 1311 if (FuncMean > WorstBias) { 1312 WorstBias = FuncMean; 1313 WorstBiasFunc = &Function; 1314 } 1315 } 1316 if (NumBlocksConsidered > 0) 1317 FlowImbalanceMean /= NumBlocksConsidered; 1318 1319 // Compute standard deviation 1320 NumBlocksConsidered = 0; 1321 double FlowImbalanceVar = 0.0; 1322 for (const auto &BFI : BC.getBinaryFunctions()) { 1323 const BinaryFunction &Function = BFI.second; 1324 if (Function.empty() || !Function.isSimple()) 1325 continue; 1326 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1327 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1328 for (const BinaryBasicBlock &BB : Function) { 1329 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0) 1330 continue; 1331 ++NumBlocksConsidered; 1332 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1333 FlowImbalanceVar += 1334 pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2); 1335 } 1336 } 1337 if (NumBlocksConsidered) { 1338 FlowImbalanceVar /= NumBlocksConsidered; 1339 FlowImbalanceVar = sqrt(FlowImbalanceVar); 1340 } 1341 1342 // Report to user 1343 outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n", 1344 (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar)); 1345 if (WorstBiasFunc && opts::Verbosity >= 1) { 1346 outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName() 1347 << "\n"; 1348 LLVM_DEBUG(WorstBiasFunc->dump()); 1349 } 1350 } 1351 1352 void PrintProgramStats::runOnFunctions(BinaryContext &BC) { 1353 uint64_t NumRegularFunctions = 0; 1354 uint64_t NumStaleProfileFunctions = 0; 1355 uint64_t NumAllStaleFunctions = 0; 1356 uint64_t NumInferredFunctions = 0; 1357 uint64_t NumNonSimpleProfiledFunctions = 0; 1358 uint64_t NumUnknownControlFlowFunctions = 0; 1359 uint64_t TotalSampleCount = 0; 1360 uint64_t StaleSampleCount = 0; 1361 uint64_t InferredSampleCount = 0; 1362 std::vector<const BinaryFunction *> ProfiledFunctions; 1363 const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; 1364 for (auto &BFI : BC.getBinaryFunctions()) { 1365 const BinaryFunction &Function = BFI.second; 1366 1367 // Ignore PLT functions for stats. 1368 if (Function.isPLTFunction()) 1369 continue; 1370 1371 ++NumRegularFunctions; 1372 1373 if (!Function.isSimple()) { 1374 if (Function.hasProfile()) 1375 ++NumNonSimpleProfiledFunctions; 1376 continue; 1377 } 1378 1379 if (Function.hasUnknownControlFlow()) { 1380 if (opts::PrintUnknownCFG) 1381 Function.dump(); 1382 else if (opts::PrintUnknown) 1383 errs() << "function with unknown control flow: " << Function << '\n'; 1384 1385 ++NumUnknownControlFlowFunctions; 1386 } 1387 1388 if (!Function.hasProfile()) 1389 continue; 1390 1391 uint64_t SampleCount = Function.getRawBranchCount(); 1392 TotalSampleCount += SampleCount; 1393 1394 if (Function.hasValidProfile()) { 1395 ProfiledFunctions.push_back(&Function); 1396 if (Function.hasInferredProfile()) { 1397 ++NumInferredFunctions; 1398 InferredSampleCount += SampleCount; 1399 ++NumAllStaleFunctions; 1400 } 1401 } else { 1402 if (opts::ReportStaleFuncs) { 1403 outs() << StaleFuncsHeader; 1404 StaleFuncsHeader = ""; 1405 outs() << " " << Function << '\n'; 1406 } 1407 ++NumStaleProfileFunctions; 1408 StaleSampleCount += SampleCount; 1409 ++NumAllStaleFunctions; 1410 } 1411 } 1412 BC.NumProfiledFuncs = ProfiledFunctions.size(); 1413 BC.NumStaleProfileFuncs = NumStaleProfileFunctions; 1414 1415 const size_t NumAllProfiledFunctions = 1416 ProfiledFunctions.size() + NumStaleProfileFunctions; 1417 outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " 1418 << NumRegularFunctions << " functions in the binary (" 1419 << format("%.1f", NumAllProfiledFunctions / 1420 (float)NumRegularFunctions * 100.0f) 1421 << "%) have non-empty execution profile\n"; 1422 if (NumNonSimpleProfiledFunctions) { 1423 outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" 1424 << (NumNonSimpleProfiledFunctions == 1 ? "" : "s") 1425 << " with profile could not be optimized\n"; 1426 } 1427 if (NumStaleProfileFunctions) { 1428 const float PctStale = 1429 NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f; 1430 auto printErrorOrWarning = [&]() { 1431 if (PctStale > opts::StaleThreshold) 1432 errs() << "BOLT-ERROR: "; 1433 else 1434 errs() << "BOLT-WARNING: "; 1435 }; 1436 printErrorOrWarning(); 1437 errs() << NumStaleProfileFunctions 1438 << format(" (%.1f%% of all profiled)", PctStale) << " function" 1439 << (NumStaleProfileFunctions == 1 ? "" : "s") 1440 << " have invalid (possibly stale) profile." 1441 " Use -report-stale to see the list.\n"; 1442 if (TotalSampleCount > 0) { 1443 printErrorOrWarning(); 1444 errs() << StaleSampleCount << " out of " << TotalSampleCount 1445 << " samples in the binary (" 1446 << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount)) 1447 << "%) belong to functions with invalid" 1448 " (possibly stale) profile.\n"; 1449 } 1450 if (PctStale > opts::StaleThreshold) { 1451 errs() << "BOLT-ERROR: stale functions exceed specified threshold of " 1452 << opts::StaleThreshold << "%. Exiting.\n"; 1453 exit(1); 1454 } 1455 } 1456 if (NumInferredFunctions) { 1457 outs() << format("BOLT-INFO: inferred profile for %d (%.2f%% of profiled, " 1458 "%.2f%% of stale) functions responsible for %.2f%% samples" 1459 " (%zu out of %zu)\n", 1460 NumInferredFunctions, 1461 100.0 * NumInferredFunctions / NumAllProfiledFunctions, 1462 100.0 * NumInferredFunctions / NumAllStaleFunctions, 1463 100.0 * InferredSampleCount / TotalSampleCount, 1464 InferredSampleCount, TotalSampleCount); 1465 outs() << format( 1466 "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" 1467 " (%zu out of %zu stale) responsible for %.2f%% samples" 1468 " (%zu out of %zu stale)\n", 1469 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, 1470 BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, 1471 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, 1472 BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); 1473 } 1474 1475 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1476 outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1477 << " objects was ignored\n"; 1478 } 1479 1480 if (ProfiledFunctions.size() > 10) { 1481 if (opts::Verbosity >= 1) { 1482 outs() << "BOLT-INFO: top called functions are:\n"; 1483 llvm::sort(ProfiledFunctions, 1484 [](const BinaryFunction *A, const BinaryFunction *B) { 1485 return B->getExecutionCount() < A->getExecutionCount(); 1486 }); 1487 auto SFI = ProfiledFunctions.begin(); 1488 auto SFIend = ProfiledFunctions.end(); 1489 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1490 ++SFI, ++I) 1491 outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n'; 1492 } 1493 } 1494 1495 if (!opts::PrintSortedBy.empty()) { 1496 std::vector<BinaryFunction *> Functions; 1497 std::map<const BinaryFunction *, DynoStats> Stats; 1498 1499 for (auto &BFI : BC.getBinaryFunctions()) { 1500 BinaryFunction &BF = BFI.second; 1501 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1502 Functions.push_back(&BF); 1503 Stats.emplace(&BF, getDynoStats(BF)); 1504 } 1505 } 1506 1507 const bool SortAll = 1508 llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT); 1509 1510 const bool Ascending = 1511 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1512 1513 if (SortAll) { 1514 llvm::stable_sort(Functions, 1515 [Ascending, &Stats](const BinaryFunction *A, 1516 const BinaryFunction *B) { 1517 return Ascending ? Stats.at(A) < Stats.at(B) 1518 : Stats.at(B) < Stats.at(A); 1519 }); 1520 } else { 1521 llvm::stable_sort( 1522 Functions, [Ascending, &Stats](const BinaryFunction *A, 1523 const BinaryFunction *B) { 1524 const DynoStats &StatsA = Stats.at(A); 1525 const DynoStats &StatsB = Stats.at(B); 1526 return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy) 1527 : StatsB.lessThan(StatsA, opts::PrintSortedBy); 1528 }); 1529 } 1530 1531 outs() << "BOLT-INFO: top functions sorted by "; 1532 if (SortAll) { 1533 outs() << "dyno stats"; 1534 } else { 1535 outs() << "("; 1536 bool PrintComma = false; 1537 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1538 if (PrintComma) 1539 outs() << ", "; 1540 outs() << DynoStats::Description(Category); 1541 PrintComma = true; 1542 } 1543 outs() << ")"; 1544 } 1545 1546 outs() << " are:\n"; 1547 auto SFI = Functions.begin(); 1548 for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { 1549 const DynoStats Stats = getDynoStats(**SFI); 1550 outs() << " " << **SFI; 1551 if (!SortAll) { 1552 outs() << " ("; 1553 bool PrintComma = false; 1554 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1555 if (PrintComma) 1556 outs() << ", "; 1557 outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; 1558 PrintComma = true; 1559 } 1560 outs() << ")"; 1561 } 1562 outs() << "\n"; 1563 } 1564 } 1565 1566 if (!BC.TrappedFunctions.empty()) { 1567 errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" 1568 << (BC.TrappedFunctions.size() > 1 ? "s" : "") 1569 << " will trap on entry. Use -trap-avx512=0 to disable" 1570 " traps."; 1571 if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { 1572 errs() << '\n'; 1573 for (const BinaryFunction *Function : BC.TrappedFunctions) 1574 errs() << " " << *Function << '\n'; 1575 } else { 1576 errs() << " Use -v=1 to see the list.\n"; 1577 } 1578 } 1579 1580 // Print information on missed macro-fusion opportunities seen on input. 1581 if (BC.Stats.MissedMacroFusionPairs) { 1582 outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)" 1583 " opportunities for macro-fusion optimization", 1584 BC.Stats.MissedMacroFusionPairs, 1585 BC.Stats.MissedMacroFusionExecCount); 1586 switch (opts::AlignMacroOpFusion) { 1587 case MFT_NONE: 1588 outs() << ". Use -align-macro-fusion to fix.\n"; 1589 break; 1590 case MFT_HOT: 1591 outs() << ". Will fix instances on a hot path.\n"; 1592 break; 1593 case MFT_ALL: 1594 outs() << " that are going to be fixed\n"; 1595 break; 1596 } 1597 } 1598 1599 // Collect and print information about suboptimal code layout on input. 1600 if (opts::ReportBadLayout) { 1601 std::vector<BinaryFunction *> SuboptimalFuncs; 1602 for (auto &BFI : BC.getBinaryFunctions()) { 1603 BinaryFunction &BF = BFI.second; 1604 if (!BF.hasValidProfile()) 1605 continue; 1606 1607 const uint64_t HotThreshold = 1608 std::max<uint64_t>(BF.getKnownExecutionCount(), 1); 1609 bool HotSeen = false; 1610 for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) { 1611 if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { 1612 HotSeen = true; 1613 continue; 1614 } 1615 if (HotSeen && BB->getKnownExecutionCount() == 0) { 1616 SuboptimalFuncs.push_back(&BF); 1617 break; 1618 } 1619 } 1620 } 1621 1622 if (!SuboptimalFuncs.empty()) { 1623 llvm::sort(SuboptimalFuncs, 1624 [](const BinaryFunction *A, const BinaryFunction *B) { 1625 return A->getKnownExecutionCount() / A->getSize() > 1626 B->getKnownExecutionCount() / B->getSize(); 1627 }); 1628 1629 outs() << "BOLT-INFO: " << SuboptimalFuncs.size() 1630 << " functions have " 1631 "cold code in the middle of hot code. Top functions are:\n"; 1632 for (unsigned I = 0; 1633 I < std::min(static_cast<size_t>(opts::ReportBadLayout), 1634 SuboptimalFuncs.size()); 1635 ++I) 1636 SuboptimalFuncs[I]->print(outs()); 1637 } 1638 } 1639 1640 if (NumUnknownControlFlowFunctions) { 1641 outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions 1642 << " functions have instructions with unknown control flow"; 1643 if (!opts::PrintUnknown) 1644 outs() << ". Use -print-unknown to see the list."; 1645 outs() << '\n'; 1646 } 1647 } 1648 1649 void InstructionLowering::runOnFunctions(BinaryContext &BC) { 1650 for (auto &BFI : BC.getBinaryFunctions()) 1651 for (BinaryBasicBlock &BB : BFI.second) 1652 for (MCInst &Instruction : BB) 1653 BC.MIB->lowerTailCall(Instruction); 1654 } 1655 1656 void StripRepRet::runOnFunctions(BinaryContext &BC) { 1657 if (!BC.isX86()) 1658 return; 1659 1660 uint64_t NumPrefixesRemoved = 0; 1661 uint64_t NumBytesSaved = 0; 1662 for (auto &BFI : BC.getBinaryFunctions()) { 1663 for (BinaryBasicBlock &BB : BFI.second) { 1664 auto LastInstRIter = BB.getLastNonPseudo(); 1665 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) || 1666 !BC.MIB->deleteREPPrefix(*LastInstRIter)) 1667 continue; 1668 1669 NumPrefixesRemoved += BB.getKnownExecutionCount(); 1670 ++NumBytesSaved; 1671 } 1672 } 1673 1674 if (NumBytesSaved) 1675 outs() << "BOLT-INFO: removed " << NumBytesSaved 1676 << " 'repz' prefixes" 1677 " with estimated execution count of " 1678 << NumPrefixesRemoved << " times.\n"; 1679 } 1680 1681 void InlineMemcpy::runOnFunctions(BinaryContext &BC) { 1682 if (!BC.isX86()) 1683 return; 1684 1685 uint64_t NumInlined = 0; 1686 uint64_t NumInlinedDyno = 0; 1687 for (auto &BFI : BC.getBinaryFunctions()) { 1688 for (BinaryBasicBlock &BB : BFI.second) { 1689 for (auto II = BB.begin(); II != BB.end(); ++II) { 1690 MCInst &Inst = *II; 1691 1692 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1693 !Inst.getOperand(0).isExpr()) 1694 continue; 1695 1696 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1697 if (CalleeSymbol->getName() != "memcpy" && 1698 CalleeSymbol->getName() != "memcpy@PLT" && 1699 CalleeSymbol->getName() != "_memcpy8") 1700 continue; 1701 1702 const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); 1703 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1704 1705 const InstructionListType NewCode = 1706 BC.MIB->createInlineMemcpy(IsMemcpy8); 1707 II = BB.replaceInstruction(II, NewCode); 1708 std::advance(II, NewCode.size() - 1); 1709 if (IsTailCall) { 1710 MCInst Return; 1711 BC.MIB->createReturn(Return); 1712 II = BB.insertInstruction(std::next(II), std::move(Return)); 1713 } 1714 1715 ++NumInlined; 1716 NumInlinedDyno += BB.getKnownExecutionCount(); 1717 } 1718 } 1719 } 1720 1721 if (NumInlined) { 1722 outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; 1723 if (NumInlinedDyno) 1724 outs() << ". The calls were executed " << NumInlinedDyno 1725 << " times based on profile."; 1726 outs() << '\n'; 1727 } 1728 } 1729 1730 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { 1731 if (!BinaryFunctionPass::shouldOptimize(Function)) 1732 return false; 1733 1734 for (const std::string &FunctionSpec : Spec) { 1735 StringRef FunctionName = StringRef(FunctionSpec).split(':').first; 1736 if (Function.hasNameRegex(FunctionName)) 1737 return true; 1738 } 1739 1740 return false; 1741 } 1742 1743 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( 1744 const BinaryFunction &Function) const { 1745 StringRef SitesString; 1746 for (const std::string &FunctionSpec : Spec) { 1747 StringRef FunctionName; 1748 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':'); 1749 if (Function.hasNameRegex(FunctionName)) 1750 break; 1751 SitesString = ""; 1752 } 1753 1754 std::set<size_t> Sites; 1755 SmallVector<StringRef, 4> SitesVec; 1756 SitesString.split(SitesVec, ':'); 1757 for (StringRef SiteString : SitesVec) { 1758 if (SiteString.empty()) 1759 continue; 1760 size_t Result; 1761 if (!SiteString.getAsInteger(10, Result)) 1762 Sites.emplace(Result); 1763 } 1764 1765 return Sites; 1766 } 1767 1768 void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { 1769 if (!BC.isX86()) 1770 return; 1771 1772 uint64_t NumSpecialized = 0; 1773 uint64_t NumSpecializedDyno = 0; 1774 for (auto &BFI : BC.getBinaryFunctions()) { 1775 BinaryFunction &Function = BFI.second; 1776 if (!shouldOptimize(Function)) 1777 continue; 1778 1779 std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function); 1780 auto shouldOptimize = [&](size_t N) { 1781 return CallsToOptimize.empty() || CallsToOptimize.count(N); 1782 }; 1783 1784 std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); 1785 size_t CallSiteID = 0; 1786 for (BinaryBasicBlock *CurBB : Blocks) { 1787 for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { 1788 MCInst &Inst = *II; 1789 1790 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1791 !Inst.getOperand(0).isExpr()) 1792 continue; 1793 1794 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1795 if (CalleeSymbol->getName() != "memcpy" && 1796 CalleeSymbol->getName() != "memcpy@PLT") 1797 continue; 1798 1799 if (BC.MIB->isTailCall(Inst)) 1800 continue; 1801 1802 ++CallSiteID; 1803 1804 if (!shouldOptimize(CallSiteID)) 1805 continue; 1806 1807 // Create a copy of a call to memcpy(dest, src, size). 1808 MCInst MemcpyInstr = Inst; 1809 1810 BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II); 1811 1812 BinaryBasicBlock *NextBB = nullptr; 1813 if (OneByteMemcpyBB->getNumNonPseudos() > 1) { 1814 NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin()); 1815 NextBB->eraseInstruction(NextBB->begin()); 1816 } else { 1817 NextBB = OneByteMemcpyBB->getSuccessor(); 1818 OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin()); 1819 assert(NextBB && "unexpected call to memcpy() with no return"); 1820 } 1821 1822 BinaryBasicBlock *MemcpyBB = Function.addBasicBlock(); 1823 MemcpyBB->setOffset(CurBB->getInputOffset()); 1824 InstructionListType CmpJCC = 1825 BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1, 1826 OneByteMemcpyBB->getLabel(), BC.Ctx.get()); 1827 CurBB->addInstructions(CmpJCC); 1828 CurBB->addSuccessor(MemcpyBB); 1829 1830 MemcpyBB->addInstruction(std::move(MemcpyInstr)); 1831 MemcpyBB->addSuccessor(NextBB); 1832 MemcpyBB->setCFIState(NextBB->getCFIState()); 1833 MemcpyBB->setExecutionCount(0); 1834 1835 // To prevent the actual call from being moved to cold, we set its 1836 // execution count to 1. 1837 if (CurBB->getKnownExecutionCount() > 0) 1838 MemcpyBB->setExecutionCount(1); 1839 1840 InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy(); 1841 OneByteMemcpyBB->addInstructions(OneByteMemcpy); 1842 1843 ++NumSpecialized; 1844 NumSpecializedDyno += CurBB->getKnownExecutionCount(); 1845 1846 CurBB = NextBB; 1847 1848 // Note: we don't expect the next instruction to be a call to memcpy. 1849 II = CurBB->begin(); 1850 } 1851 } 1852 } 1853 1854 if (NumSpecialized) { 1855 outs() << "BOLT-INFO: specialized " << NumSpecialized 1856 << " memcpy() call sites for size 1"; 1857 if (NumSpecializedDyno) 1858 outs() << ". The calls were executed " << NumSpecializedDyno 1859 << " times based on profile."; 1860 outs() << '\n'; 1861 } 1862 } 1863 1864 void RemoveNops::runOnFunction(BinaryFunction &BF) { 1865 const BinaryContext &BC = BF.getBinaryContext(); 1866 for (BinaryBasicBlock &BB : BF) { 1867 for (int64_t I = BB.size() - 1; I >= 0; --I) { 1868 MCInst &Inst = BB.getInstructionAtIndex(I); 1869 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP")) 1870 BB.eraseInstructionAtIndex(I); 1871 } 1872 } 1873 } 1874 1875 void RemoveNops::runOnFunctions(BinaryContext &BC) { 1876 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 1877 runOnFunction(BF); 1878 }; 1879 1880 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 1881 return BF.shouldPreserveNops(); 1882 }; 1883 1884 ParallelUtilities::runOnEachFunction( 1885 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 1886 SkipFunc, "RemoveNops"); 1887 } 1888 1889 } // namespace bolt 1890 } // namespace llvm 1891