//===- bolt/Passes/BinaryPasses.cpp - Binary-level passes ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements multiple passes for binary optimization and analysis.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm> // for std::replace below
#include <atomic>
#include <mutex>
#include <numeric>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
  assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
         C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");

  static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];

  OptNames[C] = bolt::DynoStats::Description(C);

  std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');

  return OptNames[C].c_str();
}

namespace opts {

extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
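
// Illustration (hypothetical invocation): dynoStatsOptName() above turns a
// dyno stat description such as "taken branches" into the option value
// "taken-branches" accepted by -print-sorted-by (defined below), so functions
// can be ranked with, e.g.,
//
//   llvm-bolt a.out -o a.bolt -print-sorted-by=taken-branches \
//       -print-sorted-by-order=ascending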

cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));

bool isHotTextMover(const BinaryFunction &Function) {
  for (std::string &SectionName : opts::HotTextMoveSections) {
    if (Function.getOriginSectionName() &&
        *Function.getOriginSectionName() == SectionName)
      return true;
  }

  return false;
}

static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(0), cl::cat(BoltOptCategory));

static cl::list<bolt::DynoStats::Category>
    PrintSortedBy("print-sorted-by", cl::CommaSeparated,
                  cl::desc("print functions sorted by order of dyno stats"),
                  cl::value_desc("key1,key2,key3,..."),
                  cl::values(
#define D(name, description, ...)                                             \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),  \
             description),
                      REAL_DYNO_STATS
#undef D
                      clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                                 "sorted by all names")),
                  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
    PrintUnknown("print-unknown",
                 cl::desc("print names of functions with unknown control flow"),
                 cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
    PrintUnknownCFG("print-unknown-cfg",
                    cl::desc("dump CFG of functions with unknown control flow"),
                    cl::cat(BoltCategory), cl::ReallyHidden);
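
// Illustration (hypothetical invocation): the -peepholes values defined above
// compose, so "-peepholes=double-jumps,useless-branches" enables exactly those
// two transformations, while "-peepholes=all" enables every peephole
// implemented in this file.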

// Please MSVC19 with a forward declaration: otherwise it reports an error
// about an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior "
                   "(deprecated alias for ext-tsp)"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
StaleThreshold("stale-threshold",
  cl::desc("maximum percentage of stale functions to tolerate (default: 100)"),
  cl::init(100),
  cl::Hidden,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc("maximum number of hot basic blocks in a function for which to use "
             "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

} // namespace opts

namespace llvm {
namespace bolt {

bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
  return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
         !BF.isIgnored();
}

bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
  return BF.isSimple() && !BF.isIgnored();
}

void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  bool NeedsFixBranches = false;
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      if (Predecessor->hasJumpTable())
        break;

      if (Predecessor == Successor)
        break;

      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}
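
// Illustration of the normalization above (sketch only, block names
// hypothetical): an empty block B that is neither an entry point nor a
// landing pad and has a unique successor is bypassed,
//
//   A --> B --> C   becomes   A --> C
//
// and B is marked for removal once it has no predecessors left. An empty
// dangling block that still has predecessors is kept, since it may be the
// codegen result of __builtin_unreachable.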
"" : "s") 316 << '\n'; 317 } 318 319 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { 320 BinaryContext &BC = Function.getBinaryContext(); 321 unsigned Count; 322 uint64_t Bytes; 323 Function.markUnreachableBlocks(); 324 LLVM_DEBUG({ 325 for (BinaryBasicBlock &BB : Function) { 326 if (!BB.isValid()) { 327 dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() 328 << " in function " << Function << "\n"; 329 Function.dump(); 330 } 331 } 332 }); 333 BinaryContext::IndependentCodeEmitter Emitter = 334 BC.createIndependentMCCodeEmitter(); 335 std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get()); 336 DeletedBlocks += Count; 337 DeletedBytes += Bytes; 338 if (Count) { 339 auto L = BC.scopeLock(); 340 Modified.insert(&Function); 341 if (opts::Verbosity > 0) 342 outs() << "BOLT-INFO: removed " << Count 343 << " dead basic block(s) accounting for " << Bytes 344 << " bytes in function " << Function << '\n'; 345 } 346 } 347 348 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { 349 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 350 runOnFunction(BF); 351 }; 352 353 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 354 return !shouldOptimize(BF) || BF.getLayout().block_empty(); 355 }; 356 357 ParallelUtilities::runOnEachFunction( 358 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, 359 SkipPredicate, "elimininate-unreachable"); 360 361 if (DeletedBlocks) 362 outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " 363 << DeletedBytes << " bytes of code\n"; 364 } 365 366 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { 367 return (BinaryFunctionPass::shouldPrint(BF) && 368 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); 369 } 370 371 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { 372 // Apply execution count threshold 373 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) 374 return false; 375 376 return BinaryFunctionPass::shouldOptimize(BF); 377 } 378 379 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { 380 if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) 381 return; 382 383 std::atomic_uint64_t ModifiedFuncCount(0); 384 std::mutex FunctionEditDistanceMutex; 385 DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance; 386 387 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 388 SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder; 389 if (opts::PrintFuncStat > 0) 390 llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder)); 391 392 const bool LayoutChanged = 393 modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters); 394 if (LayoutChanged) { 395 ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed); 396 if (opts::PrintFuncStat > 0) { 397 const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder); 398 std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex); 399 FunctionEditDistance[&BF] = Distance; 400 } 401 } 402 }; 403 404 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 405 return !shouldOptimize(BF); 406 }; 407 408 ParallelUtilities::runOnEachFunction( 409 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc, 410 "ReorderBasicBlocks"); 411 const size_t NumAllProfiledFunctions = 412 BC.NumProfiledFuncs + BC.NumStaleProfileFuncs; 413 414 outs() << "BOLT-INFO: basic block reordering modified layout of " 415 << format("%zu functions (%.2lf%% of 

void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return;

  std::atomic_uint64_t ModifiedFuncCount(0);
  std::mutex FunctionEditDistanceMutex;
  DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;

  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
    if (opts::PrintFuncStat > 0)
      llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));

    const bool LayoutChanged =
        modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
    if (LayoutChanged) {
      ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
      if (opts::PrintFuncStat > 0) {
        const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
        std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
        FunctionEditDistance[&BF] = Distance;
      }
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "ReorderBasicBlocks");
  const size_t NumAllProfiledFunctions =
      BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;

  outs() << "BOLT-INFO: basic block reordering modified layout of "
         << format("%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
                   ModifiedFuncCount.load(std::memory_order_relaxed),
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       NumAllProfiledFunctions,
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       BC.getBinaryFunctions().size());

  if (opts::PrintFuncStat > 0) {
    raw_ostream &OS = outs();
    // Collect functions keyed by score. Use a multimap so that functions with
    // identical scores are all kept rather than silently dropped.
    std::multimap<uint64_t, BinaryFunction &> ScoreMap;
    auto &BFs = BC.getBinaryFunctions();
    for (auto It = BFs.begin(); It != BFs.end(); ++It)
      ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
          It->second.getFunctionScore(), It->second));

    OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
    OS << "           There are " << BFs.size() << " functions in total.\n";
    OS << "           Number of functions being modified: "
       << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
    OS << "           Detailed information is printed for the top "
       << opts::PrintFuncStat << " functions, ranked by function score.\n\n";
    uint64_t I = 0;
    for (std::multimap<uint64_t, BinaryFunction &>::reverse_iterator Rit =
             ScoreMap.rbegin();
         Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
      BinaryFunction &Function = Rit->second;

      OS << "            Information for top function " << (I + 1) << ":\n";
      OS << "              Function score: " << Function.getFunctionScore()
         << "\n";
      OS << "              Number of basic blocks: " << Function.size()
         << "\n";
      OS << "              Number of instructions: "
         << Function.getInstructionCount() << "\n";
      OS << "              Layout edit distance: "
         << FunctionEditDistance.lookup(&Function) << "\n\n";
    }
  }
}
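
// For orientation, the layout types map to the following algorithms in
// modifyFunctionLayout() below (the switch there is authoritative):
//   reverse          -> ReverseReorderAlgorithm
//   normal           -> OptimizeReorderAlgorithm
//   branch-predictor -> OptimizeBranchReorderAlgorithm
//   cache            -> OptimizeCacheReorderAlgorithm
//   ext-tsp          -> ExtTSPReorderAlgorithm
//   cluster-shuffle  -> RandomClusterReorderAlgorithm
// Functions with at most -tsp-threshold blocks get a precise
// TSPReorderAlgorithm solution instead (except under cluster-shuffle), and
// cache+ is remapped to ext-tsp by the option callback above.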

bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  if (BF.size() == 0 || Type == LT_NONE)
    return false;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
  if (Type != LT_REVERSE && !BF.hasValidProfile())
    return false;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough.
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    default:
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  return BF.getLayout().update(NewLayout);
}

void FixupBranches::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (!BC.shouldEmit(Function) || !Function.isSimple())
      continue;

    Function.fixBranches();
  }
}

void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      if (BC.HasRelocations) {
        errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
               << ". Exiting.\n";
        exit(1);
      }
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
}

void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
  if (BC.HasRelocations)
    return;

  if (!opts::UpdateDebugSections)
    return;

  // If the function wouldn't fit, mark it as non-simple. Otherwise, we may
  // emit incorrect debug info.
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    uint64_t HotSize, ColdSize;
    std::tie(HotSize, ColdSize) =
        BC.calculateEmittedSize(BF, /*FixBranches=*/false);
    if (HotSize > BF.getMaxSize())
      BF.setSimple(false);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "CheckLargeFunctions");
}

bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Unlike other passes, allow functions in non-CFG state.
  return BF.isSimple() && !BF.isIgnored();
}

void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
    for (FunctionFragment &FF : BF->getLayout().fragments()) {
      // Reset at the start of the new fragment.
      int64_t CurrentGnuArgsSize = 0;

      for (BinaryBasicBlock *const BB : FF) {
        for (auto II = BB->begin(); II != BB->end(); ++II) {

          // Convert GnuArgsSize annotations into CFIs.
          if (BF->usesGnuArgsSize() && BC.MIB->isInvoke(*II)) {
            const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
            assert(NewGnuArgsSize >= 0 &&
                   "Expected non-negative GNU_args_size.");
            if (NewGnuArgsSize != CurrentGnuArgsSize) {
              auto InsertII = BF->addCFIInstruction(
                  BB, II,
                  MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
              CurrentGnuArgsSize = NewGnuArgsSize;
              II = std::next(InsertII);
            }
          }

          // Preserve selected annotations and strip the rest.
          std::optional<uint32_t> Offset = BF->requiresAddressTranslation()
                                               ? BC.MIB->getOffset(*II)
                                               : std::nullopt;
          MCSymbol *Label = BC.MIB->getLabel(*II);

          BC.MIB->stripAnnotations(*II);

          if (Offset)
            BC.MIB->setOffset(*II, *Offset);
          if (Label)
            BC.MIB->setLabel(*II, Label);
        }
      }
    }
  }

  // Release all memory taken by annotations.
  BC.MIB->freeAnnotations();
}

// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
void CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.second;
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
}

// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
//     B0: ...
//         jmp  B1   (or jcc B1)
//
//     B1: jmp  B2
//
// ->
//
//     B0: ...
//         jmp  B2   (or jcc B2)
//
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}

bool SimplifyConditionalTailCalls::shouldRewriteBranch(
    const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
    const BinaryBasicBlock *BB, const bool DirectionFlag) {
  if (BeenOptimized.count(PredBB))
    return false;

  const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);

  if (IsForward)
    ++NumOrigForwardBranches;
  else
    ++NumOrigBackwardBranches;

  if (opts::SctcMode == opts::SctcAlways)
    return true;

  if (opts::SctcMode == opts::SctcPreserveDirection)
    return IsForward == DirectionFlag;

  const ErrorOr<std::pair<double, double>> Frequency =
      PredBB->getBranchStats(BB);

  // It's ok to rewrite the conditional branch if the new target will be
  // a backward branch.

  // If no data is available for these branches, then it should be ok to
  // do the optimization since it will reduce code size.
  if (Frequency.getError())
    return true;

  // TODO: should this use misprediction frequency instead?
  const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
                      (!IsForward && Frequency.get().first <= 0.5);

  return Result == DirectionFlag;
}
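
// Sketch of the SCTC rewrite implemented below (illustrative assembly only;
// labels and the callee name "foo" are hypothetical). Given a block whose
// single instruction is a direct tail call, a conditional branch into it can
// itself become a conditional tail call:
//
//   PredBB: jcc BB                   PredBB: jcc foo  # conditional tail call
//           ...                -->           ...
//   BB:     jmp foo  # tail         BB:     (unreachable; removed by UCE)
//
// If PredBB reaches BB through its fallthrough edge instead of its taken
// edge, the branch condition is reversed first, and the old taken target is
// reattached by the NeedsUncondBranch fix-up loop at the end of the pass.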

uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will the block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions,
      // e.g. jrcxz.
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid. We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add an unconditional
        // branch to the old target. This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(PredBB, CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc.
      MIB->setConditionalTailCall(*CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry.
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(BB);
      // Update BB execution count.
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block. Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(PredBB, false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    if (UncondBranch) {
      if (HasFallthrough)
        PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
      else
        MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(BF, true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n");

  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}

void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;

    if (!shouldOptimize(Function))
      continue;

    if (fixTailCalls(Function)) {
      Modified.insert(&Function);
      Function.setHasCanonicalCFG(false);
    }
  }

  if (NumTailCallsPatched)
    outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
           << " tail calls (" << NumOrigForwardBranches << " forward, "
           << NumOrigBackwardBranches << " backward)"
           << " from a total of " << NumCandidateTailCalls << " while removing "
           << NumDoubleJumps << " double jumps"
           << " and removing " << DeletedBlocks << " basic blocks"
           << " totalling " << DeletedBytes
           << " bytes of code. CTCs total execution count is " << CTCExecCount
           << " and the number of times CTCs are taken is " << CTCTakenCount
           << "\n";
}

uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
  uint64_t Count = 0;
  const BinaryContext &BC = Function.getBinaryContext();
  for (BinaryBasicBlock &BB : Function) {
    for (MCInst &Inst : BB) {
      MCInst OriginalInst;
      if (opts::Verbosity > 2)
        OriginalInst = Inst;

      if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
        continue;

      if (opts::Verbosity > 2) {
        // Keep the lock alive for the duration of the printout; a discarded
        // temporary would release it immediately.
        auto L = BC.scopeLock();
        outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
        BC.printInstruction(outs(), OriginalInst, 0, &Function);
        outs() << "BOLT-INFO: to:";
        BC.printInstruction(outs(), Inst, 0, &Function);
      }

      ++Count;
    }
  }

  return Count;
}

void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
  std::atomic<uint64_t> NumShortened{0};
  if (!BC.isX86())
    return;

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
      [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
      nullptr, "ShortenInstructions");

  if (NumShortened)
    outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
}

void Peepholes::addTailcallTraps(BinaryFunction &Function) {
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    MCInst *Inst = BB.getLastNonPseudoInstr();
    if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
      MCInst Trap;
      if (MIB->createTrap(Trap)) {
        BB.addInstruction(Trap);
        ++TailCallTraps;
      }
    }
  }
}

void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
  for (BinaryBasicBlock &BB : Function) {
    if (BB.succ_size() != 2)
      continue;

    BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
    BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
    if (CondBB != UncondBB)
      continue;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // analyzeBranch() can fail due to unusual branch instructions,
    // e.g. jrcxz, or jump tables (indirect jump).
    if (!Result || !CondBranch)
      continue;

    BB.removeDuplicateConditionalSuccessor(CondBranch);
    ++NumUselessCondBranches;
  }
}

void Peepholes::runOnFunctions(BinaryContext &BC) {
  const char Opts =
      std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
                      [](const char A, const PeepholeOpts B) { return A | B; });
  if (Opts == PEEP_NONE)
    return;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function)) {
      if (Opts & PEEP_DOUBLE_JUMPS)
        NumDoubleJumps += fixDoubleJumps(Function, false);
      if (Opts & PEEP_TAILCALL_TRAPS)
        addTailcallTraps(Function);
      if (Opts & PEEP_USELESS_BRANCHES)
        removeUselessCondBranches(Function);
      assert(Function.validateCFG());
    }
  }
  outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
         << " double jumps patched.\n"
         << "BOLT-INFO: Peephole: " << TailCallTraps
         << " tail call traps inserted.\n"
         << "BOLT-INFO: Peephole: " << NumUselessCondBranches
         << " useless conditional branches removed.\n";
}
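
// Sketch of the simplification below (illustrative x86; the symbol and value
// are hypothetical): a load whose target address is statically known and
// falls in a read-only section can be rewritten to use the constant directly,
//
//   mov K(%rip), %eax    # K resides in .rodata and holds 0x2a
//     -->
//   mov $0x2a, %eax
//
// provided no static or dynamic relocation can change the data at that
// address.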

bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address.
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      if (BC.getRelocationAt(TargetAddress) ||
          BC.getDynamicRelocationAt(TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}

void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function) && simplifyRODataLoads(Function))
      Modified.insert(&Function);
  }

  outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
         << NumLoadsFound << " loads from a statically computed address.\n"
         << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
         << "\n"
         << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
}

void AssignSections::runOnFunctions(BinaryContext &BC) {
  for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
    Function->setCodeSectionName(BC.getInjectedCodeSectionName());
    Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
  }

  // In non-relocation mode functions have pre-assigned section names.
  if (!BC.HasRelocations)
    return;

  const bool UseColdSection =
      BC.NumProfiledFuncs > 0 ||
      opts::ReorderFunctions == ReorderFunctions::RT_USER;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (opts::isHotTextMover(Function)) {
      Function.setCodeSectionName(BC.getHotTextMoverSectionName());
      Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
      continue;
    }

    if (!UseColdSection || Function.hasValidIndex())
      Function.setCodeSectionName(BC.getMainCodeSectionName());
    else
      Function.setCodeSectionName(BC.getColdCodeSectionName());

    if (Function.isSplit())
      Function.setColdCodeSectionName(BC.getColdCodeSectionName());
  }
}
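
// The bias score printed below is based on the per-block relative flow
// imbalance
//
//   |out(BB) - in(BB)| / in(BB)
//
// where in/out are the sums of profiled incoming/outgoing edge counts; the
// pass reports the mean and standard deviation of this quantity over all
// considered blocks (low-frequency, entry, and exit blocks are skipped). For
// example, with made-up numbers, a block with in = 200 and out = 180
// contributes |180 - 200| / 200 = 0.10, i.e. a 10% local imbalance.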

void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the
  // frequency of incoming edges for each BB. Likewise for each OutgoingMap
  // and the sum of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean.
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks.
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation.
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report to the user.
  outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                   (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
           << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
}

void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumRegularFunctions = 0;
  uint64_t NumStaleProfileFunctions = 0;
  uint64_t NumAllStaleFunctions = 0;
  uint64_t NumInferredFunctions = 0;
  uint64_t NumNonSimpleProfiledFunctions = 0;
  uint64_t NumUnknownControlFlowFunctions = 0;
  uint64_t TotalSampleCount = 0;
  uint64_t StaleSampleCount = 0;
  uint64_t InferredSampleCount = 0;
  std::vector<const BinaryFunction *> ProfiledFunctions;
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;

    // Ignore PLT functions for stats.
    if (Function.isPLTFunction())
      continue;

    ++NumRegularFunctions;

    if (!Function.isSimple()) {
      if (Function.hasProfile())
        ++NumNonSimpleProfiledFunctions;
      continue;
    }

    if (Function.hasUnknownControlFlow()) {
      if (opts::PrintUnknownCFG)
        Function.dump();
      else if (opts::PrintUnknown)
        errs() << "function with unknown control flow: " << Function << '\n';

      ++NumUnknownControlFlowFunctions;
    }

    if (!Function.hasProfile())
      continue;

    uint64_t SampleCount = Function.getRawBranchCount();
    TotalSampleCount += SampleCount;

    if (Function.hasValidProfile()) {
      ProfiledFunctions.push_back(&Function);
      if (Function.hasInferredProfile()) {
        ++NumInferredFunctions;
        InferredSampleCount += SampleCount;
        ++NumAllStaleFunctions;
      }
    } else {
      if (opts::ReportStaleFuncs) {
        outs() << StaleFuncsHeader;
        StaleFuncsHeader = "";
        outs() << "  " << Function << '\n';
      }
      ++NumStaleProfileFunctions;
      StaleSampleCount += SampleCount;
      ++NumAllStaleFunctions;
    }
  }
  BC.NumProfiledFuncs = ProfiledFunctions.size();
  BC.NumStaleProfileFuncs = NumStaleProfileFunctions;

  const size_t NumAllProfiledFunctions =
      ProfiledFunctions.size() + NumStaleProfileFunctions;
  outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
         << NumRegularFunctions << " functions in the binary ("
         << format("%.1f", NumAllProfiledFunctions /
                               (float)NumRegularFunctions * 100.0f)
         << "%) have non-empty execution profile\n";
  if (NumNonSimpleProfiledFunctions) {
    outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
           << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
           << " with profile could not be optimized\n";
  }
  if (NumStaleProfileFunctions) {
    const float PctStale =
        NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f;
    auto printErrorOrWarning = [&]() {
      if (PctStale > opts::StaleThreshold)
        errs() << "BOLT-ERROR: ";
      else
        errs() << "BOLT-WARNING: ";
    };
    printErrorOrWarning();
    errs() << NumStaleProfileFunctions
           << format(" (%.1f%% of all profiled)", PctStale) << " function"
           << (NumStaleProfileFunctions == 1 ? "" : "s")
           << " have invalid (possibly stale) profile."
              " Use -report-stale to see the list.\n";
    if (TotalSampleCount > 0) {
      printErrorOrWarning();
      errs() << StaleSampleCount << " out of " << TotalSampleCount
             << " samples in the binary ("
             << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount))
             << "%) belong to functions with invalid"
                " (possibly stale) profile.\n";
    }
Exiting.\n"; 1443 exit(1); 1444 } 1445 } 1446 if (NumInferredFunctions) { 1447 outs() << format("BOLT-INFO: inferred profile for %d (%.2f%% of profiled, " 1448 "%.2f%% of stale) functions responsible for %.2f%% samples" 1449 " (%zu out of %zu)\n", 1450 NumInferredFunctions, 1451 100.0 * NumInferredFunctions / NumAllProfiledFunctions, 1452 100.0 * NumInferredFunctions / NumAllStaleFunctions, 1453 100.0 * InferredSampleCount / TotalSampleCount, 1454 InferredSampleCount, TotalSampleCount); 1455 outs() << format( 1456 "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" 1457 " (%zu out of %zu stale) responsible for %.2f%% samples" 1458 " (%zu out of %zu stale)\n", 1459 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, 1460 BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, 1461 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, 1462 BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); 1463 } 1464 1465 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1466 outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1467 << " objects was ignored\n"; 1468 } 1469 1470 if (ProfiledFunctions.size() > 10) { 1471 if (opts::Verbosity >= 1) { 1472 outs() << "BOLT-INFO: top called functions are:\n"; 1473 llvm::sort(ProfiledFunctions, 1474 [](const BinaryFunction *A, const BinaryFunction *B) { 1475 return B->getExecutionCount() < A->getExecutionCount(); 1476 }); 1477 auto SFI = ProfiledFunctions.begin(); 1478 auto SFIend = ProfiledFunctions.end(); 1479 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1480 ++SFI, ++I) 1481 outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n'; 1482 } 1483 } 1484 1485 if (!opts::PrintSortedBy.empty()) { 1486 std::vector<BinaryFunction *> Functions; 1487 std::map<const BinaryFunction *, DynoStats> Stats; 1488 1489 for (auto &BFI : BC.getBinaryFunctions()) { 1490 BinaryFunction &BF = BFI.second; 1491 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1492 Functions.push_back(&BF); 1493 Stats.emplace(&BF, getDynoStats(BF)); 1494 } 1495 } 1496 1497 const bool SortAll = 1498 llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT); 1499 1500 const bool Ascending = 1501 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1502 1503 if (SortAll) { 1504 llvm::stable_sort(Functions, 1505 [Ascending, &Stats](const BinaryFunction *A, 1506 const BinaryFunction *B) { 1507 return Ascending ? Stats.at(A) < Stats.at(B) 1508 : Stats.at(B) < Stats.at(A); 1509 }); 1510 } else { 1511 llvm::stable_sort( 1512 Functions, [Ascending, &Stats](const BinaryFunction *A, 1513 const BinaryFunction *B) { 1514 const DynoStats &StatsA = Stats.at(A); 1515 const DynoStats &StatsB = Stats.at(B); 1516 return Ascending ? 

    if (SortAll) {
      llvm::stable_sort(Functions,
                        [Ascending, &Stats](const BinaryFunction *A,
                                            const BinaryFunction *B) {
                          return Ascending ? Stats.at(A) < Stats.at(B)
                                           : Stats.at(B) < Stats.at(A);
                        });
    } else {
      llvm::stable_sort(
          Functions, [Ascending, &Stats](const BinaryFunction *A,
                                         const BinaryFunction *B) {
            const DynoStats &StatsA = Stats.at(A);
            const DynoStats &StatsB = Stats.at(B);
            return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
                             : StatsB.lessThan(StatsA, opts::PrintSortedBy);
          });
    }

    outs() << "BOLT-INFO: top functions sorted by ";
    if (SortAll) {
      outs() << "dyno stats";
    } else {
      outs() << "(";
      bool PrintComma = false;
      for (const DynoStats::Category Category : opts::PrintSortedBy) {
        if (PrintComma)
          outs() << ", ";
        outs() << DynoStats::Description(Category);
        PrintComma = true;
      }
      outs() << ")";
    }

    outs() << " are:\n";
    auto SFI = Functions.begin();
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
      const DynoStats Stats = getDynoStats(**SFI);
      outs() << "  " << **SFI;
      if (!SortAll) {
        outs() << " (";
        bool PrintComma = false;
        for (const DynoStats::Category Category : opts::PrintSortedBy) {
          if (PrintComma)
            outs() << ", ";
          outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
          PrintComma = true;
        }
        outs() << ")";
      }
      outs() << "\n";
    }
  }

  if (!BC.TrappedFunctions.empty()) {
    errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
           << (BC.TrappedFunctions.size() > 1 ? "s" : "")
           << " will trap on entry. Use -trap-avx512=0 to disable"
              " traps.";
    if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
      errs() << '\n';
      for (const BinaryFunction *Function : BC.TrappedFunctions)
        errs() << "  " << *Function << '\n';
    } else {
      errs() << " Use -v=1 to see the list.\n";
    }
  }

  // Print information on missed macro-fusion opportunities seen on input.
  if (BC.Stats.MissedMacroFusionPairs) {
    outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)"
                     " opportunities for macro-fusion optimization",
                     BC.Stats.MissedMacroFusionPairs,
                     BC.Stats.MissedMacroFusionExecCount);
    switch (opts::AlignMacroOpFusion) {
    case MFT_NONE:
      outs() << ". Use -align-macro-fusion to fix.\n";
      break;
    case MFT_HOT:
      outs() << ". Will fix instances on a hot path.\n";
      break;
    case MFT_ALL:
      outs() << " that are going to be fixed\n";
      break;
    }
  }

  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(&BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      llvm::sort(SuboptimalFuncs,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });
  if (opts::ReportBadLayout) {
    std::vector<BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(&BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      llvm::sort(SuboptimalFuncs,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });

      outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
             << " functions have cold code in the middle of hot code. "
                "Top functions are:\n";
      for (unsigned I = 0;
           I < std::min(static_cast<size_t>(opts::ReportBadLayout),
                        SuboptimalFuncs.size());
           ++I)
        SuboptimalFuncs[I]->print(outs());
    }
  }

  if (NumUnknownControlFlowFunctions) {
    outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
           << " functions have instructions with unknown control flow";
    if (!opts::PrintUnknown)
      outs() << ". Use -print-unknown to see the list.";
    outs() << '\n';
  }
}

void InstructionLowering::runOnFunctions(BinaryContext &BC) {
  for (auto &BFI : BC.getBinaryFunctions())
    for (BinaryBasicBlock &BB : BFI.second)
      for (MCInst &Instruction : BB)
        BC.MIB->lowerTailCall(Instruction);
}

void StripRepRet::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumPrefixesRemoved = 0;
  uint64_t NumBytesSaved = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      auto LastInstRIter = BB.getLastNonPseudo();
      if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
          !BC.MIB->deleteREPPrefix(*LastInstRIter))
        continue;

      NumPrefixesRemoved += BB.getKnownExecutionCount();
      ++NumBytesSaved;
    }
  }

  if (NumBytesSaved)
    outs() << "BOLT-INFO: removed " << NumBytesSaved
           << " 'repz' prefixes with estimated execution count of "
           << NumPrefixesRemoved << " times.\n";
}

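// Inline calls to memcpy() (and the _memcpy8 variant) on x86. The replacement
// sequence comes from MIB->createInlineMemcpy(); as a sketch, and assuming
// the SysV argument registers still hold the memcpy() arguments at the call
// site, it amounts to:
//
//   mov %rdi, %rax   # memcpy() returns dest
//   rep movsb        # copy %rdx bytes from (%rsi) to (%rdi)
//
// For _memcpy8, the return value is the end of the destination range instead.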
void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumInlined = 0;
  uint64_t NumInlinedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8")
          continue;

        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(IsMemcpy8);
        II = BB.replaceInstruction(II, NewCode);
        std::advance(II, NewCode.size() - 1);
        if (IsTailCall) {
          MCInst Return;
          BC.MIB->createReturn(Return);
          II = BB.insertInstruction(std::next(II), std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
    if (NumInlinedDyno)
      outs() << ". The calls were executed " << NumInlinedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}

bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
  if (!BinaryFunctionPass::shouldOptimize(Function))
    return false;

  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
    if (Function.hasNameRegex(FunctionName))
      return true;
  }

  return false;
}

std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
    const BinaryFunction &Function) const {
  StringRef SitesString;
  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName;
    std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
    if (Function.hasNameRegex(FunctionName))
      break;
    SitesString = "";
  }

  std::set<size_t> Sites;
  SmallVector<StringRef, 4> SitesVec;
  SitesString.split(SitesVec, ':');
  for (StringRef SiteString : SitesVec) {
    if (SiteString.empty())
      continue;
    size_t Result;
    if (!SiteString.getAsInteger(10, Result))
      Sites.emplace(Result);
  }

  return Sites;
}

void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        if (BC.MIB->isTailCall(Inst))
          continue;

        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

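        // Rewrite the call site so that size-1 copies bypass the library
        // call. A sketch of the resulting control flow, assuming x86-64 SysV,
        // where getIntArgRegister(2), i.e. %rdx, holds the size argument:
        //
        //            CurBB: cmp $0x1, %rdx; je OneByteMemcpyBB
        //             /                          \
        //      MemcpyBB (call memcpy)    OneByteMemcpyBB (one-byte copy)
        //             \                          /
        //                        NextBB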
        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
                                OneByteMemcpyBB->getLabel(), BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to the cold section, we
        // set its execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    outs() << "BOLT-INFO: specialized " << NumSpecialized
           << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      outs() << ". The calls were executed " << NumSpecializedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}

void RemoveNops::runOnFunction(BinaryFunction &BF) {
  const BinaryContext &BC = BF.getBinaryContext();
  for (BinaryBasicBlock &BB : BF) {
    // Iterate backwards so that erasing an instruction does not shift the
    // indices of instructions yet to be visited.
    for (int64_t I = BB.size() - 1; I >= 0; --I) {
      MCInst &Inst = BB.getInstructionAtIndex(I);
      if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
        BB.eraseInstructionAtIndex(I);
    }
  }
}

void RemoveNops::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    runOnFunction(BF);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return BF.shouldPreserveNops();
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "RemoveNops");
}

} // namespace bolt
} // namespace llvm