1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements multiple passes for binary optimization and analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/BinaryPasses.h" 14 #include "bolt/Core/FunctionLayout.h" 15 #include "bolt/Core/ParallelUtilities.h" 16 #include "bolt/Passes/ReorderAlgorithm.h" 17 #include "bolt/Passes/ReorderFunctions.h" 18 #include "llvm/Support/CommandLine.h" 19 #include <atomic> 20 #include <mutex> 21 #include <numeric> 22 #include <vector> 23 24 #define DEBUG_TYPE "bolt-opts" 25 26 using namespace llvm; 27 using namespace bolt; 28 29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) { 30 assert(C > bolt::DynoStats::FIRST_DYNO_STAT && 31 C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category."); 32 33 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1]; 34 35 OptNames[C] = bolt::DynoStats::Description(C); 36 37 std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); 38 39 return OptNames[C].c_str(); 40 } 41 42 namespace opts { 43 44 extern cl::OptionCategory BoltCategory; 45 extern cl::OptionCategory BoltOptCategory; 46 47 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion; 48 extern cl::opt<unsigned> Verbosity; 49 extern cl::opt<bool> EnableBAT; 50 extern cl::opt<unsigned> ExecutionCountThreshold; 51 extern cl::opt<bool> UpdateDebugSections; 52 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions; 53 54 enum DynoStatsSortOrder : char { 55 Ascending, 56 Descending 57 }; 58 59 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt( 60 "print-sorted-by-order", 61 cl::desc("use ascending or descending order when printing functions " 62 "ordered by dyno stats"), 63 cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory)); 64 65 cl::list<std::string> 66 HotTextMoveSections("hot-text-move-sections", 67 cl::desc("list of sections containing functions used for hugifying hot text. " 68 "BOLT makes sure these functions are not placed on the same page as " 69 "the hot text. (default=\'.stub,.mover\')."), 70 cl::value_desc("sec1,sec2,sec3,..."), 71 cl::CommaSeparated, 72 cl::ZeroOrMore, 73 cl::cat(BoltCategory)); 74 75 bool isHotTextMover(const BinaryFunction &Function) { 76 for (std::string &SectionName : opts::HotTextMoveSections) { 77 if (Function.getOriginSectionName() && 78 *Function.getOriginSectionName() == SectionName) 79 return true; 80 } 81 82 return false; 83 } 84 85 static cl::opt<bool> MinBranchClusters( 86 "min-branch-clusters", 87 cl::desc("use a modified clustering algorithm geared towards minimizing " 88 "branches"), 89 cl::Hidden, cl::cat(BoltOptCategory)); 90 91 static cl::list<Peepholes::PeepholeOpts> Peepholes( 92 "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"), 93 cl::value_desc("opt1,opt2,opt3,..."), 94 cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"), 95 clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps", 96 "remove double jumps when able"), 97 clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps", 98 "insert tail call traps"), 99 clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches", 100 "remove useless conditional branches"), 101 clEnumValN(Peepholes::PEEP_ALL, "all", 102 "enable all peephole optimizations")), 103 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 104 105 static cl::opt<unsigned> 106 PrintFuncStat("print-function-statistics", 107 cl::desc("print statistics about basic block ordering"), 108 cl::init(0), cl::cat(BoltOptCategory)); 109 110 static cl::list<bolt::DynoStats::Category> 111 PrintSortedBy("print-sorted-by", cl::CommaSeparated, 112 cl::desc("print functions sorted by order of dyno stats"), 113 cl::value_desc("key1,key2,key3,..."), 114 cl::values( 115 #define D(name, description, ...) \ 116 clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name), \ 117 description), 118 REAL_DYNO_STATS 119 #undef D 120 clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all", 121 "sorted by all names")), 122 cl::ZeroOrMore, cl::cat(BoltOptCategory)); 123 124 static cl::opt<bool> 125 PrintUnknown("print-unknown", 126 cl::desc("print names of functions with unknown control flow"), 127 cl::cat(BoltCategory), cl::Hidden); 128 129 static cl::opt<bool> 130 PrintUnknownCFG("print-unknown-cfg", 131 cl::desc("dump CFG of functions with unknown control flow"), 132 cl::cat(BoltCategory), cl::ReallyHidden); 133 134 // Please MSVC19 with a forward declaration: otherwise it reports an error about 135 // an undeclared variable inside a callback. 136 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks; 137 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks( 138 "reorder-blocks", cl::desc("change layout of basic blocks in a function"), 139 cl::init(bolt::ReorderBasicBlocks::LT_NONE), 140 cl::values( 141 clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none", 142 "do not reorder basic blocks"), 143 clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse", 144 "layout blocks in reverse order"), 145 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal", 146 "perform optimal layout based on profile"), 147 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH, 148 "branch-predictor", 149 "perform optimal layout prioritizing branch " 150 "predictions"), 151 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache", 152 "perform optimal layout prioritizing I-cache " 153 "behavior"), 154 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+", 155 "perform layout optimizing I-cache behavior"), 156 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp", 157 "perform layout optimizing I-cache behavior"), 158 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, 159 "cluster-shuffle", "perform random layout of clusters")), 160 cl::ZeroOrMore, cl::cat(BoltOptCategory), 161 cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) { 162 if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) { 163 errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please" 164 << " use '-reorder-blocks=ext-tsp' instead\n"; 165 ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP; 166 } 167 })); 168 169 static cl::opt<unsigned> ReportBadLayout( 170 "report-bad-layout", 171 cl::desc("print top <uint> functions with suboptimal code layout on input"), 172 cl::init(0), cl::Hidden, cl::cat(BoltOptCategory)); 173 174 static cl::opt<bool> 175 ReportStaleFuncs("report-stale", 176 cl::desc("print the list of functions with stale profile"), 177 cl::Hidden, cl::cat(BoltOptCategory)); 178 179 enum SctcModes : char { 180 SctcAlways, 181 SctcPreserveDirection, 182 SctcHeuristic 183 }; 184 185 static cl::opt<SctcModes> 186 SctcMode("sctc-mode", 187 cl::desc("mode for simplify conditional tail calls"), 188 cl::init(SctcAlways), 189 cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), 190 clEnumValN(SctcPreserveDirection, 191 "preserve", 192 "only perform sctc when branch direction is " 193 "preserved"), 194 clEnumValN(SctcHeuristic, 195 "heuristic", 196 "use branch prediction data to control sctc")), 197 cl::ZeroOrMore, 198 cl::cat(BoltOptCategory)); 199 200 static cl::opt<unsigned> 201 StaleThreshold("stale-threshold", 202 cl::desc( 203 "maximum percentage of stale functions to tolerate (default: 100)"), 204 cl::init(100), 205 cl::Hidden, 206 cl::cat(BoltOptCategory)); 207 208 static cl::opt<unsigned> TSPThreshold( 209 "tsp-threshold", 210 cl::desc( 211 "maximum number of hot basic blocks in a function for which to use " 212 "a precise TSP solution while re-ordering basic blocks"), 213 cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); 214 215 static cl::opt<unsigned> TopCalledLimit( 216 "top-called-limit", 217 cl::desc("maximum number of functions to print in top called " 218 "functions section"), 219 cl::init(100), cl::Hidden, cl::cat(BoltCategory)); 220 221 } // namespace opts 222 223 namespace llvm { 224 namespace bolt { 225 226 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { 227 return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG && 228 !BF.isIgnored(); 229 } 230 231 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { 232 return BF.isSimple() && !BF.isIgnored(); 233 } 234 235 void NormalizeCFG::runOnFunction(BinaryFunction &BF) { 236 uint64_t NumRemoved = 0; 237 uint64_t NumDuplicateEdges = 0; 238 uint64_t NeedsFixBranches = 0; 239 for (BinaryBasicBlock &BB : BF) { 240 if (!BB.empty()) 241 continue; 242 243 if (BB.isEntryPoint() || BB.isLandingPad()) 244 continue; 245 246 // Handle a dangling empty block. 247 if (BB.succ_size() == 0) { 248 // If an empty dangling basic block has a predecessor, it could be a 249 // result of codegen for __builtin_unreachable. In such case, do not 250 // remove the block. 251 if (BB.pred_size() == 0) { 252 BB.markValid(false); 253 ++NumRemoved; 254 } 255 continue; 256 } 257 258 // The block should have just one successor. 259 BinaryBasicBlock *Successor = BB.getSuccessor(); 260 assert(Successor && "invalid CFG encountered"); 261 262 // Redirect all predecessors to the successor block. 263 while (!BB.pred_empty()) { 264 BinaryBasicBlock *Predecessor = *BB.pred_begin(); 265 if (Predecessor->hasJumpTable()) 266 break; 267 268 if (Predecessor == Successor) 269 break; 270 271 BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB); 272 Predecessor->replaceSuccessor(&BB, Successor, BI.Count, 273 BI.MispredictedCount); 274 // We need to fix branches even if we failed to replace all successors 275 // and remove the block. 276 NeedsFixBranches = true; 277 } 278 279 if (BB.pred_empty()) { 280 BB.removeAllSuccessors(); 281 BB.markValid(false); 282 ++NumRemoved; 283 } 284 } 285 286 if (NumRemoved) 287 BF.eraseInvalidBBs(); 288 289 // Check for duplicate successors. Do it after the empty block elimination as 290 // we can get more duplicate successors. 291 for (BinaryBasicBlock &BB : BF) 292 if (!BB.hasJumpTable() && BB.succ_size() == 2 && 293 BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true)) 294 ++NumDuplicateEdges; 295 296 // fixBranches() will get rid of duplicate edges and update jump instructions. 297 if (NumDuplicateEdges || NeedsFixBranches) 298 BF.fixBranches(); 299 300 NumDuplicateEdgesMerged += NumDuplicateEdges; 301 NumBlocksRemoved += NumRemoved; 302 } 303 304 Error NormalizeCFG::runOnFunctions(BinaryContext &BC) { 305 ParallelUtilities::runOnEachFunction( 306 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, 307 [&](BinaryFunction &BF) { runOnFunction(BF); }, 308 [&](const BinaryFunction &BF) { return !shouldOptimize(BF); }, 309 "NormalizeCFG"); 310 if (NumBlocksRemoved) 311 outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block" 312 << (NumBlocksRemoved == 1 ? "" : "s") << '\n'; 313 if (NumDuplicateEdgesMerged) 314 outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged 315 << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s") 316 << '\n'; 317 return Error::success(); 318 } 319 320 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { 321 BinaryContext &BC = Function.getBinaryContext(); 322 unsigned Count; 323 uint64_t Bytes; 324 Function.markUnreachableBlocks(); 325 LLVM_DEBUG({ 326 for (BinaryBasicBlock &BB : Function) { 327 if (!BB.isValid()) { 328 dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() 329 << " in function " << Function << "\n"; 330 Function.dump(); 331 } 332 } 333 }); 334 BinaryContext::IndependentCodeEmitter Emitter = 335 BC.createIndependentMCCodeEmitter(); 336 std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get()); 337 DeletedBlocks += Count; 338 DeletedBytes += Bytes; 339 if (Count) { 340 auto L = BC.scopeLock(); 341 Modified.insert(&Function); 342 if (opts::Verbosity > 0) 343 outs() << "BOLT-INFO: removed " << Count 344 << " dead basic block(s) accounting for " << Bytes 345 << " bytes in function " << Function << '\n'; 346 } 347 } 348 349 Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { 350 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 351 runOnFunction(BF); 352 }; 353 354 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 355 return !shouldOptimize(BF) || BF.getLayout().block_empty(); 356 }; 357 358 ParallelUtilities::runOnEachFunction( 359 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, 360 SkipPredicate, "elimininate-unreachable"); 361 362 if (DeletedBlocks) 363 outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " 364 << DeletedBytes << " bytes of code\n"; 365 return Error::success(); 366 } 367 368 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { 369 return (BinaryFunctionPass::shouldPrint(BF) && 370 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); 371 } 372 373 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { 374 // Apply execution count threshold 375 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) 376 return false; 377 378 return BinaryFunctionPass::shouldOptimize(BF); 379 } 380 381 Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { 382 if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) 383 return Error::success(); 384 385 std::atomic_uint64_t ModifiedFuncCount(0); 386 std::mutex FunctionEditDistanceMutex; 387 DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance; 388 389 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 390 SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder; 391 if (opts::PrintFuncStat > 0) 392 llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder)); 393 394 const bool LayoutChanged = 395 modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters); 396 if (LayoutChanged) { 397 ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed); 398 if (opts::PrintFuncStat > 0) { 399 const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder); 400 std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex); 401 FunctionEditDistance[&BF] = Distance; 402 } 403 } 404 }; 405 406 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 407 return !shouldOptimize(BF); 408 }; 409 410 ParallelUtilities::runOnEachFunction( 411 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc, 412 "ReorderBasicBlocks"); 413 const size_t NumAllProfiledFunctions = 414 BC.NumProfiledFuncs + BC.NumStaleProfileFuncs; 415 416 outs() << "BOLT-INFO: basic block reordering modified layout of " 417 << format("%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n", 418 ModifiedFuncCount.load(std::memory_order_relaxed), 419 100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) / 420 NumAllProfiledFunctions, 421 100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) / 422 BC.getBinaryFunctions().size()); 423 424 if (opts::PrintFuncStat > 0) { 425 raw_ostream &OS = outs(); 426 // Copy all the values into vector in order to sort them 427 std::map<uint64_t, BinaryFunction &> ScoreMap; 428 auto &BFs = BC.getBinaryFunctions(); 429 for (auto It = BFs.begin(); It != BFs.end(); ++It) 430 ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>( 431 It->second.getFunctionScore(), It->second)); 432 433 OS << "\nBOLT-INFO: Printing Function Statistics:\n\n"; 434 OS << " There are " << BFs.size() << " functions in total. \n"; 435 OS << " Number of functions being modified: " 436 << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n"; 437 OS << " User asks for detailed information on top " 438 << opts::PrintFuncStat << " functions. (Ranked by function score)" 439 << "\n\n"; 440 uint64_t I = 0; 441 for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit = 442 ScoreMap.rbegin(); 443 Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { 444 BinaryFunction &Function = Rit->second; 445 446 OS << " Information for function of top: " << (I + 1) << ": \n"; 447 OS << " Function Score is: " << Function.getFunctionScore() 448 << "\n"; 449 OS << " There are " << Function.size() 450 << " number of blocks in this function.\n"; 451 OS << " There are " << Function.getInstructionCount() 452 << " number of instructions in this function.\n"; 453 OS << " The edit distance for this function is: " 454 << FunctionEditDistance.lookup(&Function) << "\n\n"; 455 } 456 } 457 return Error::success(); 458 } 459 460 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, 461 LayoutType Type, 462 bool MinBranchClusters) const { 463 if (BF.size() == 0 || Type == LT_NONE) 464 return false; 465 466 BinaryFunction::BasicBlockOrderType NewLayout; 467 std::unique_ptr<ReorderAlgorithm> Algo; 468 469 // Cannot do optimal layout without profile. 470 if (Type != LT_REVERSE && !BF.hasValidProfile()) 471 return false; 472 473 if (Type == LT_REVERSE) { 474 Algo.reset(new ReverseReorderAlgorithm()); 475 } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) { 476 // Work on optimal solution if problem is small enough 477 LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); 478 Algo.reset(new TSPReorderAlgorithm()); 479 } else { 480 LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n"); 481 482 std::unique_ptr<ClusterAlgorithm> CAlgo; 483 if (MinBranchClusters) 484 CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); 485 else 486 CAlgo.reset(new PHGreedyClusterAlgorithm()); 487 488 switch (Type) { 489 case LT_OPTIMIZE: 490 Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); 491 break; 492 493 case LT_OPTIMIZE_BRANCH: 494 Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); 495 break; 496 497 case LT_OPTIMIZE_CACHE: 498 Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); 499 break; 500 501 case LT_OPTIMIZE_EXT_TSP: 502 Algo.reset(new ExtTSPReorderAlgorithm()); 503 break; 504 505 case LT_OPTIMIZE_SHUFFLE: 506 Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); 507 break; 508 509 default: 510 llvm_unreachable("unexpected layout type"); 511 } 512 } 513 514 Algo->reorderBasicBlocks(BF, NewLayout); 515 516 return BF.getLayout().update(NewLayout); 517 } 518 519 Error FixupBranches::runOnFunctions(BinaryContext &BC) { 520 for (auto &It : BC.getBinaryFunctions()) { 521 BinaryFunction &Function = It.second; 522 if (!BC.shouldEmit(Function) || !Function.isSimple()) 523 continue; 524 525 Function.fixBranches(); 526 } 527 return Error::success(); 528 } 529 530 Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) { 531 std::atomic<bool> HasFatal{false}; 532 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 533 if (!BF.finalizeCFIState()) { 534 if (BC.HasRelocations) { 535 errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF 536 << ". Exiting.\n"; 537 HasFatal = true; 538 return; 539 } 540 BF.setSimple(false); 541 return; 542 } 543 544 BF.setFinalized(); 545 546 // Update exception handling information. 547 BF.updateEHRanges(); 548 }; 549 550 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { 551 return !BC.shouldEmit(BF); 552 }; 553 554 ParallelUtilities::runOnEachFunction( 555 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, 556 SkipPredicate, "FinalizeFunctions"); 557 if (HasFatal) 558 return createFatalBOLTError("finalize CFI state failure"); 559 return Error::success(); 560 } 561 562 Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { 563 if (BC.HasRelocations) 564 return Error::success(); 565 566 // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit 567 // incorrect meta data. 568 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 569 uint64_t HotSize, ColdSize; 570 std::tie(HotSize, ColdSize) = 571 BC.calculateEmittedSize(BF, /*FixBranches=*/false); 572 if (HotSize > BF.getMaxSize()) 573 BF.setSimple(false); 574 }; 575 576 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 577 return !shouldOptimize(BF); 578 }; 579 580 ParallelUtilities::runOnEachFunction( 581 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 582 SkipFunc, "CheckLargeFunctions"); 583 584 return Error::success(); 585 } 586 587 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { 588 // Unlike other passes, allow functions in non-CFG state. 589 return BF.isSimple() && !BF.isIgnored(); 590 } 591 592 Error LowerAnnotations::runOnFunctions(BinaryContext &BC) { 593 // Convert GnuArgsSize annotations into CFIs. 594 for (BinaryFunction *BF : BC.getAllBinaryFunctions()) { 595 for (FunctionFragment &FF : BF->getLayout().fragments()) { 596 // Reset at the start of the new fragment. 597 int64_t CurrentGnuArgsSize = 0; 598 599 for (BinaryBasicBlock *const BB : FF) { 600 for (auto II = BB->begin(); II != BB->end(); ++II) { 601 if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II)) 602 continue; 603 604 const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); 605 assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size."); 606 if (NewGnuArgsSize == CurrentGnuArgsSize) 607 continue; 608 609 auto InsertII = BF->addCFIInstruction( 610 BB, II, 611 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); 612 CurrentGnuArgsSize = NewGnuArgsSize; 613 II = std::next(InsertII); 614 } 615 } 616 } 617 } 618 return Error::success(); 619 } 620 621 // Check for dirty state in MCSymbol objects that might be a consequence 622 // of running calculateEmittedSize() in parallel, during split functions 623 // pass. If an inconsistent state is found (symbol already registered or 624 // already defined), clean it. 625 Error CleanMCState::runOnFunctions(BinaryContext &BC) { 626 MCContext &Ctx = *BC.Ctx; 627 for (const auto &SymMapEntry : Ctx.getSymbols()) { 628 const MCSymbol *S = SymMapEntry.second; 629 if (S->isDefined()) { 630 LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() 631 << "\" is already defined\n"); 632 const_cast<MCSymbol *>(S)->setUndefined(); 633 } 634 if (S->isRegistered()) { 635 LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() 636 << "\" is already registered\n"); 637 const_cast<MCSymbol *>(S)->setIsRegistered(false); 638 } 639 LLVM_DEBUG(if (S->isVariable()) { 640 dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n"; 641 }); 642 } 643 return Error::success(); 644 } 645 646 // This peephole fixes jump instructions that jump to another basic 647 // block with a single jump instruction, e.g. 648 // 649 // B0: ... 650 // jmp B1 (or jcc B1) 651 // 652 // B1: jmp B2 653 // 654 // -> 655 // 656 // B0: ... 657 // jmp B2 (or jcc B2) 658 // 659 static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) { 660 uint64_t NumDoubleJumps = 0; 661 662 MCContext *Ctx = Function.getBinaryContext().Ctx.get(); 663 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 664 for (BinaryBasicBlock &BB : Function) { 665 auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ, 666 const MCSymbol *SuccSym) { 667 // Ignore infinite loop jumps or fallthrough tail jumps. 668 if (Pred == Succ || Succ == &BB) 669 return false; 670 671 if (Succ) { 672 const MCSymbol *TBB = nullptr; 673 const MCSymbol *FBB = nullptr; 674 MCInst *CondBranch = nullptr; 675 MCInst *UncondBranch = nullptr; 676 bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 677 if (!Res) { 678 LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; 679 Pred->dump()); 680 return false; 681 } 682 Pred->replaceSuccessor(&BB, Succ); 683 684 // We must patch up any existing branch instructions to match up 685 // with the new successor. 686 assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) && 687 "Predecessor block has inconsistent number of successors"); 688 if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) { 689 MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); 690 } else if (UncondBranch && 691 MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) { 692 MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); 693 } else if (!UncondBranch) { 694 assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ && 695 "Don't add an explicit jump to a fallthrough block."); 696 Pred->addBranchInstruction(Succ); 697 } 698 } else { 699 // Succ will be null in the tail call case. In this case we 700 // need to explicitly add a tail call instruction. 701 MCInst *Branch = Pred->getLastNonPseudoInstr(); 702 if (Branch && MIB->isUnconditionalBranch(*Branch)) { 703 assert(MIB->getTargetSymbol(*Branch) == BB.getLabel()); 704 Pred->removeSuccessor(&BB); 705 Pred->eraseInstruction(Pred->findInstruction(Branch)); 706 Pred->addTailCallInstruction(SuccSym); 707 } else { 708 return false; 709 } 710 } 711 712 ++NumDoubleJumps; 713 LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from " 714 << Pred->getName() << " -> " << BB.getName() << " to " 715 << Pred->getName() << " -> " << SuccSym->getName() 716 << (!Succ ? " (tail)\n" : "\n")); 717 718 return true; 719 }; 720 721 if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) 722 continue; 723 724 MCInst *Inst = BB.getFirstNonPseudoInstr(); 725 const bool IsTailCall = MIB->isTailCall(*Inst); 726 727 if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall) 728 continue; 729 730 // If we operate after SCTC make sure it's not a conditional tail call. 731 if (IsTailCall && MIB->isConditionalBranch(*Inst)) 732 continue; 733 734 const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst); 735 BinaryBasicBlock *Succ = BB.getSuccessor(); 736 737 if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym)) 738 continue; 739 740 std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()}; 741 742 for (BinaryBasicBlock *Pred : Preds) { 743 if (Pred->isLandingPad()) 744 continue; 745 746 if (Pred->getSuccessor() == &BB || 747 (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || 748 Pred->getConditionalSuccessor(false) == &BB) 749 if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid) 750 BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() || 751 BB.isEntryPoint()); 752 } 753 } 754 755 return NumDoubleJumps; 756 } 757 758 bool SimplifyConditionalTailCalls::shouldRewriteBranch( 759 const BinaryBasicBlock *PredBB, const MCInst &CondBranch, 760 const BinaryBasicBlock *BB, const bool DirectionFlag) { 761 if (BeenOptimized.count(PredBB)) 762 return false; 763 764 const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); 765 766 if (IsForward) 767 ++NumOrigForwardBranches; 768 else 769 ++NumOrigBackwardBranches; 770 771 if (opts::SctcMode == opts::SctcAlways) 772 return true; 773 774 if (opts::SctcMode == opts::SctcPreserveDirection) 775 return IsForward == DirectionFlag; 776 777 const ErrorOr<std::pair<double, double>> Frequency = 778 PredBB->getBranchStats(BB); 779 780 // It's ok to rewrite the conditional branch if the new target will be 781 // a backward branch. 782 783 // If no data available for these branches, then it should be ok to 784 // do the optimization since it will reduce code size. 785 if (Frequency.getError()) 786 return true; 787 788 // TODO: should this use misprediction frequency instead? 789 const bool Result = (IsForward && Frequency.get().first >= 0.5) || 790 (!IsForward && Frequency.get().first <= 0.5); 791 792 return Result == DirectionFlag; 793 } 794 795 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) { 796 // Need updated indices to correctly detect branch' direction. 797 BF.getLayout().updateLayoutIndices(); 798 BF.markUnreachableBlocks(); 799 800 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); 801 MCContext *Ctx = BF.getBinaryContext().Ctx.get(); 802 uint64_t NumLocalCTCCandidates = 0; 803 uint64_t NumLocalCTCs = 0; 804 uint64_t LocalCTCTakenCount = 0; 805 uint64_t LocalCTCExecCount = 0; 806 std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>> 807 NeedsUncondBranch; 808 809 // Will block be deleted by UCE? 810 auto isValid = [](const BinaryBasicBlock *BB) { 811 return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint()); 812 }; 813 814 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { 815 // Locate BB with a single direct tail-call instruction. 816 if (BB->getNumNonPseudos() != 1) 817 continue; 818 819 MCInst *Instr = BB->getFirstNonPseudoInstr(); 820 if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr)) 821 continue; 822 823 const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr); 824 if (!CalleeSymbol) 825 continue; 826 827 // Detect direction of the possible conditional tail call. 828 const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol); 829 830 // Iterate through all predecessors. 831 for (BinaryBasicBlock *PredBB : BB->predecessors()) { 832 BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true); 833 if (!CondSucc) 834 continue; 835 836 ++NumLocalCTCCandidates; 837 838 const MCSymbol *TBB = nullptr; 839 const MCSymbol *FBB = nullptr; 840 MCInst *CondBranch = nullptr; 841 MCInst *UncondBranch = nullptr; 842 bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 843 844 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz 845 if (!Result) { 846 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n"; 847 PredBB->dump()); 848 continue; 849 } 850 851 assert(Result && "internal error analyzing conditional branch"); 852 assert(CondBranch && "conditional branch expected"); 853 854 // It's possible that PredBB is also a successor to BB that may have 855 // been processed by a previous iteration of the SCTC loop, in which 856 // case it may have been marked invalid. We should skip rewriting in 857 // this case. 858 if (!PredBB->isValid()) { 859 assert(PredBB->isSuccessor(BB) && 860 "PredBB should be valid if it is not a successor to BB"); 861 continue; 862 } 863 864 // We don't want to reverse direction of the branch in new order 865 // without further profile analysis. 866 const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC; 867 if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag)) 868 continue; 869 870 // Record this block so that we don't try to optimize it twice. 871 BeenOptimized.insert(PredBB); 872 873 uint64_t Count = 0; 874 if (CondSucc != BB) { 875 // Patch the new target address into the conditional branch. 876 MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx); 877 // Since we reversed the condition on the branch we need to change 878 // the target for the unconditional branch or add a unconditional 879 // branch to the old target. This has to be done manually since 880 // fixupBranches is not called after SCTC. 881 NeedsUncondBranch.emplace_back(PredBB, CondSucc); 882 Count = PredBB->getFallthroughBranchInfo().Count; 883 } else { 884 // Change destination of the conditional branch. 885 MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx); 886 Count = PredBB->getTakenBranchInfo().Count; 887 } 888 const uint64_t CTCTakenFreq = 889 Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; 890 891 // Annotate it, so "isCall" returns true for this jcc 892 MIB->setConditionalTailCall(*CondBranch); 893 // Add info about the conditional tail call frequency, otherwise this 894 // info will be lost when we delete the associated BranchInfo entry 895 auto &CTCAnnotation = 896 MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount"); 897 CTCAnnotation = CTCTakenFreq; 898 899 // Remove the unused successor which may be eliminated later 900 // if there are no other users. 901 PredBB->removeSuccessor(BB); 902 // Update BB execution count 903 if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount()) 904 BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); 905 else if (CTCTakenFreq > BB->getKnownExecutionCount()) 906 BB->setExecutionCount(0); 907 908 ++NumLocalCTCs; 909 LocalCTCTakenCount += CTCTakenFreq; 910 LocalCTCExecCount += PredBB->getKnownExecutionCount(); 911 } 912 913 // Remove the block from CFG if all predecessors were removed. 914 BB->markValid(isValid(BB)); 915 } 916 917 // Add unconditional branches at the end of BBs to new successors 918 // as long as the successor is not a fallthrough. 919 for (auto &Entry : NeedsUncondBranch) { 920 BinaryBasicBlock *PredBB = Entry.first; 921 const BinaryBasicBlock *CondSucc = Entry.second; 922 923 const MCSymbol *TBB = nullptr; 924 const MCSymbol *FBB = nullptr; 925 MCInst *CondBranch = nullptr; 926 MCInst *UncondBranch = nullptr; 927 PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 928 929 // Find the next valid block. Invalid blocks will be deleted 930 // so they shouldn't be considered fallthrough targets. 931 const BinaryBasicBlock *NextBlock = 932 BF.getLayout().getBasicBlockAfter(PredBB, false); 933 while (NextBlock && !isValid(NextBlock)) 934 NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false); 935 936 // Get the unconditional successor to this block. 937 const BinaryBasicBlock *PredSucc = PredBB->getSuccessor(); 938 assert(PredSucc && "The other branch should be a tail call"); 939 940 const bool HasFallthrough = (NextBlock && PredSucc == NextBlock); 941 942 if (UncondBranch) { 943 if (HasFallthrough) 944 PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch)); 945 else 946 MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx); 947 } else if (!HasFallthrough) { 948 MCInst Branch; 949 MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx); 950 PredBB->addInstruction(Branch); 951 } 952 } 953 954 if (NumLocalCTCs > 0) { 955 NumDoubleJumps += fixDoubleJumps(BF, true); 956 // Clean-up unreachable tail-call blocks. 957 const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs(); 958 DeletedBlocks += Stats.first; 959 DeletedBytes += Stats.second; 960 961 assert(BF.validateCFG()); 962 } 963 964 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs 965 << " conditional tail calls from a total of " 966 << NumLocalCTCCandidates << " candidates in function " << BF 967 << ". CTCs execution count for this function is " 968 << LocalCTCExecCount << " and CTC taken count is " 969 << LocalCTCTakenCount << "\n";); 970 971 NumTailCallsPatched += NumLocalCTCs; 972 NumCandidateTailCalls += NumLocalCTCCandidates; 973 CTCExecCount += LocalCTCExecCount; 974 CTCTakenCount += LocalCTCTakenCount; 975 976 return NumLocalCTCs > 0; 977 } 978 979 Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { 980 if (!BC.isX86()) 981 return Error::success(); 982 983 for (auto &It : BC.getBinaryFunctions()) { 984 BinaryFunction &Function = It.second; 985 986 if (!shouldOptimize(Function)) 987 continue; 988 989 if (fixTailCalls(Function)) { 990 Modified.insert(&Function); 991 Function.setHasCanonicalCFG(false); 992 } 993 } 994 995 if (NumTailCallsPatched) 996 outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched 997 << " tail calls (" << NumOrigForwardBranches << " forward)" 998 << " tail calls (" << NumOrigBackwardBranches << " backward)" 999 << " from a total of " << NumCandidateTailCalls << " while removing " 1000 << NumDoubleJumps << " double jumps" 1001 << " and removing " << DeletedBlocks << " basic blocks" 1002 << " totalling " << DeletedBytes 1003 << " bytes of code. CTCs total execution count is " << CTCExecCount 1004 << " and the number of times CTCs are taken is " << CTCTakenCount 1005 << "\n"; 1006 return Error::success(); 1007 } 1008 1009 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) { 1010 uint64_t Count = 0; 1011 const BinaryContext &BC = Function.getBinaryContext(); 1012 for (BinaryBasicBlock &BB : Function) { 1013 for (MCInst &Inst : BB) { 1014 MCInst OriginalInst; 1015 if (opts::Verbosity > 2) 1016 OriginalInst = Inst; 1017 1018 if (!BC.MIB->shortenInstruction(Inst, *BC.STI)) 1019 continue; 1020 1021 if (opts::Verbosity > 2) { 1022 BC.scopeLock(); 1023 outs() << "BOLT-INFO: shortening:\nBOLT-INFO: "; 1024 BC.printInstruction(outs(), OriginalInst, 0, &Function); 1025 outs() << "BOLT-INFO: to:"; 1026 BC.printInstruction(outs(), Inst, 0, &Function); 1027 } 1028 1029 ++Count; 1030 } 1031 } 1032 1033 return Count; 1034 } 1035 1036 Error ShortenInstructions::runOnFunctions(BinaryContext &BC) { 1037 std::atomic<uint64_t> NumShortened{0}; 1038 if (!BC.isX86()) 1039 return Error::success(); 1040 1041 ParallelUtilities::runOnEachFunction( 1042 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, 1043 [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); }, 1044 nullptr, "ShortenInstructions"); 1045 1046 if (NumShortened) 1047 outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n"; 1048 return Error::success(); 1049 } 1050 1051 void Peepholes::addTailcallTraps(BinaryFunction &Function) { 1052 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); 1053 for (BinaryBasicBlock &BB : Function) { 1054 MCInst *Inst = BB.getLastNonPseudoInstr(); 1055 if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) { 1056 MCInst Trap; 1057 if (MIB->createTrap(Trap)) { 1058 BB.addInstruction(Trap); 1059 ++TailCallTraps; 1060 } 1061 } 1062 } 1063 } 1064 1065 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { 1066 for (BinaryBasicBlock &BB : Function) { 1067 if (BB.succ_size() != 2) 1068 continue; 1069 1070 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true); 1071 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false); 1072 if (CondBB != UncondBB) 1073 continue; 1074 1075 const MCSymbol *TBB = nullptr; 1076 const MCSymbol *FBB = nullptr; 1077 MCInst *CondBranch = nullptr; 1078 MCInst *UncondBranch = nullptr; 1079 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); 1080 1081 // analyzeBranch() can fail due to unusual branch instructions, 1082 // e.g. jrcxz, or jump tables (indirect jump). 1083 if (!Result || !CondBranch) 1084 continue; 1085 1086 BB.removeDuplicateConditionalSuccessor(CondBranch); 1087 ++NumUselessCondBranches; 1088 } 1089 } 1090 1091 Error Peepholes::runOnFunctions(BinaryContext &BC) { 1092 const char Opts = 1093 std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0, 1094 [](const char A, const PeepholeOpts B) { return A | B; }); 1095 if (Opts == PEEP_NONE) 1096 return Error::success(); 1097 1098 for (auto &It : BC.getBinaryFunctions()) { 1099 BinaryFunction &Function = It.second; 1100 if (shouldOptimize(Function)) { 1101 if (Opts & PEEP_DOUBLE_JUMPS) 1102 NumDoubleJumps += fixDoubleJumps(Function, false); 1103 if (Opts & PEEP_TAILCALL_TRAPS) 1104 addTailcallTraps(Function); 1105 if (Opts & PEEP_USELESS_BRANCHES) 1106 removeUselessCondBranches(Function); 1107 assert(Function.validateCFG()); 1108 } 1109 } 1110 outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps 1111 << " double jumps patched.\n" 1112 << "BOLT-INFO: Peephole: " << TailCallTraps 1113 << " tail call traps inserted.\n" 1114 << "BOLT-INFO: Peephole: " << NumUselessCondBranches 1115 << " useless conditional branches removed.\n"; 1116 return Error::success(); 1117 } 1118 1119 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) { 1120 BinaryContext &BC = BF.getBinaryContext(); 1121 MCPlusBuilder *MIB = BC.MIB.get(); 1122 1123 uint64_t NumLocalLoadsSimplified = 0; 1124 uint64_t NumDynamicLocalLoadsSimplified = 0; 1125 uint64_t NumLocalLoadsFound = 0; 1126 uint64_t NumDynamicLocalLoadsFound = 0; 1127 1128 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { 1129 for (MCInst &Inst : *BB) { 1130 unsigned Opcode = Inst.getOpcode(); 1131 const MCInstrDesc &Desc = BC.MII->get(Opcode); 1132 1133 // Skip instructions that do not load from memory. 1134 if (!Desc.mayLoad()) 1135 continue; 1136 1137 // Try to statically evaluate the target memory address; 1138 uint64_t TargetAddress; 1139 1140 if (MIB->hasPCRelOperand(Inst)) { 1141 // Try to find the symbol that corresponds to the PC-relative operand. 1142 MCOperand *DispOpI = MIB->getMemOperandDisp(Inst); 1143 assert(DispOpI != Inst.end() && "expected PC-relative displacement"); 1144 assert(DispOpI->isExpr() && 1145 "found PC-relative with non-symbolic displacement"); 1146 1147 // Get displacement symbol. 1148 const MCSymbol *DisplSymbol; 1149 uint64_t DisplOffset; 1150 1151 std::tie(DisplSymbol, DisplOffset) = 1152 MIB->getTargetSymbolInfo(DispOpI->getExpr()); 1153 1154 if (!DisplSymbol) 1155 continue; 1156 1157 // Look up the symbol address in the global symbols map of the binary 1158 // context object. 1159 BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName()); 1160 if (!BD) 1161 continue; 1162 TargetAddress = BD->getAddress() + DisplOffset; 1163 } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) { 1164 continue; 1165 } 1166 1167 // Get the contents of the section containing the target address of the 1168 // memory operand. We are only interested in read-only sections. 1169 ErrorOr<BinarySection &> DataSection = 1170 BC.getSectionForAddress(TargetAddress); 1171 if (!DataSection || DataSection->isWritable()) 1172 continue; 1173 1174 if (BC.getRelocationAt(TargetAddress) || 1175 BC.getDynamicRelocationAt(TargetAddress)) 1176 continue; 1177 1178 uint32_t Offset = TargetAddress - DataSection->getAddress(); 1179 StringRef ConstantData = DataSection->getContents(); 1180 1181 ++NumLocalLoadsFound; 1182 if (BB->hasProfile()) 1183 NumDynamicLocalLoadsFound += BB->getExecutionCount(); 1184 1185 if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { 1186 ++NumLocalLoadsSimplified; 1187 if (BB->hasProfile()) 1188 NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); 1189 } 1190 } 1191 } 1192 1193 NumLoadsFound += NumLocalLoadsFound; 1194 NumDynamicLoadsFound += NumDynamicLocalLoadsFound; 1195 NumLoadsSimplified += NumLocalLoadsSimplified; 1196 NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; 1197 1198 return NumLocalLoadsSimplified > 0; 1199 } 1200 1201 Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { 1202 for (auto &It : BC.getBinaryFunctions()) { 1203 BinaryFunction &Function = It.second; 1204 if (shouldOptimize(Function) && simplifyRODataLoads(Function)) 1205 Modified.insert(&Function); 1206 } 1207 1208 outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " 1209 << NumLoadsFound << " loads from a statically computed address.\n" 1210 << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified 1211 << "\n" 1212 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; 1213 return Error::success(); 1214 } 1215 1216 Error AssignSections::runOnFunctions(BinaryContext &BC) { 1217 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { 1218 Function->setCodeSectionName(BC.getInjectedCodeSectionName()); 1219 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); 1220 } 1221 1222 // In non-relocation mode functions have pre-assigned section names. 1223 if (!BC.HasRelocations) 1224 return Error::success(); 1225 1226 const bool UseColdSection = 1227 BC.NumProfiledFuncs > 0 || 1228 opts::ReorderFunctions == ReorderFunctions::RT_USER; 1229 for (auto &BFI : BC.getBinaryFunctions()) { 1230 BinaryFunction &Function = BFI.second; 1231 if (opts::isHotTextMover(Function)) { 1232 Function.setCodeSectionName(BC.getHotTextMoverSectionName()); 1233 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); 1234 continue; 1235 } 1236 1237 if (!UseColdSection || Function.hasValidIndex()) 1238 Function.setCodeSectionName(BC.getMainCodeSectionName()); 1239 else 1240 Function.setCodeSectionName(BC.getColdCodeSectionName()); 1241 1242 if (Function.isSplit()) 1243 Function.setColdCodeSectionName(BC.getColdCodeSectionName()); 1244 } 1245 return Error::success(); 1246 } 1247 1248 Error PrintProfileStats::runOnFunctions(BinaryContext &BC) { 1249 double FlowImbalanceMean = 0.0; 1250 size_t NumBlocksConsidered = 0; 1251 double WorstBias = 0.0; 1252 const BinaryFunction *WorstBiasFunc = nullptr; 1253 1254 // For each function CFG, we fill an IncomingMap with the sum of the frequency 1255 // of incoming edges for each BB. Likewise for each OutgoingMap and the sum 1256 // of the frequency of outgoing edges. 1257 using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>; 1258 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps; 1259 std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps; 1260 1261 // Compute mean 1262 for (const auto &BFI : BC.getBinaryFunctions()) { 1263 const BinaryFunction &Function = BFI.second; 1264 if (Function.empty() || !Function.isSimple()) 1265 continue; 1266 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1267 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1268 for (const BinaryBasicBlock &BB : Function) { 1269 uint64_t TotalOutgoing = 0ULL; 1270 auto SuccBIIter = BB.branch_info_begin(); 1271 for (BinaryBasicBlock *Succ : BB.successors()) { 1272 uint64_t Count = SuccBIIter->Count; 1273 if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { 1274 ++SuccBIIter; 1275 continue; 1276 } 1277 TotalOutgoing += Count; 1278 IncomingMap[Succ] += Count; 1279 ++SuccBIIter; 1280 } 1281 OutgoingMap[&BB] = TotalOutgoing; 1282 } 1283 1284 size_t NumBlocks = 0; 1285 double Mean = 0.0; 1286 for (const BinaryBasicBlock &BB : Function) { 1287 // Do not compute score for low frequency blocks, entry or exit blocks 1288 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint()) 1289 continue; 1290 ++NumBlocks; 1291 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1292 Mean += fabs(Difference / IncomingMap[&BB]); 1293 } 1294 1295 FlowImbalanceMean += Mean; 1296 NumBlocksConsidered += NumBlocks; 1297 if (!NumBlocks) 1298 continue; 1299 double FuncMean = Mean / NumBlocks; 1300 if (FuncMean > WorstBias) { 1301 WorstBias = FuncMean; 1302 WorstBiasFunc = &Function; 1303 } 1304 } 1305 if (NumBlocksConsidered > 0) 1306 FlowImbalanceMean /= NumBlocksConsidered; 1307 1308 // Compute standard deviation 1309 NumBlocksConsidered = 0; 1310 double FlowImbalanceVar = 0.0; 1311 for (const auto &BFI : BC.getBinaryFunctions()) { 1312 const BinaryFunction &Function = BFI.second; 1313 if (Function.empty() || !Function.isSimple()) 1314 continue; 1315 FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; 1316 FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; 1317 for (const BinaryBasicBlock &BB : Function) { 1318 if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0) 1319 continue; 1320 ++NumBlocksConsidered; 1321 const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; 1322 FlowImbalanceVar += 1323 pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2); 1324 } 1325 } 1326 if (NumBlocksConsidered) { 1327 FlowImbalanceVar /= NumBlocksConsidered; 1328 FlowImbalanceVar = sqrt(FlowImbalanceVar); 1329 } 1330 1331 // Report to user 1332 outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n", 1333 (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar)); 1334 if (WorstBiasFunc && opts::Verbosity >= 1) { 1335 outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName() 1336 << "\n"; 1337 LLVM_DEBUG(WorstBiasFunc->dump()); 1338 } 1339 return Error::success(); 1340 } 1341 1342 Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { 1343 uint64_t NumRegularFunctions = 0; 1344 uint64_t NumStaleProfileFunctions = 0; 1345 uint64_t NumAllStaleFunctions = 0; 1346 uint64_t NumInferredFunctions = 0; 1347 uint64_t NumNonSimpleProfiledFunctions = 0; 1348 uint64_t NumUnknownControlFlowFunctions = 0; 1349 uint64_t TotalSampleCount = 0; 1350 uint64_t StaleSampleCount = 0; 1351 uint64_t InferredSampleCount = 0; 1352 std::vector<const BinaryFunction *> ProfiledFunctions; 1353 const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; 1354 for (auto &BFI : BC.getBinaryFunctions()) { 1355 const BinaryFunction &Function = BFI.second; 1356 1357 // Ignore PLT functions for stats. 1358 if (Function.isPLTFunction()) 1359 continue; 1360 1361 ++NumRegularFunctions; 1362 1363 if (!Function.isSimple()) { 1364 if (Function.hasProfile()) 1365 ++NumNonSimpleProfiledFunctions; 1366 continue; 1367 } 1368 1369 if (Function.hasUnknownControlFlow()) { 1370 if (opts::PrintUnknownCFG) 1371 Function.dump(); 1372 else if (opts::PrintUnknown) 1373 errs() << "function with unknown control flow: " << Function << '\n'; 1374 1375 ++NumUnknownControlFlowFunctions; 1376 } 1377 1378 if (!Function.hasProfile()) 1379 continue; 1380 1381 uint64_t SampleCount = Function.getRawBranchCount(); 1382 TotalSampleCount += SampleCount; 1383 1384 if (Function.hasValidProfile()) { 1385 ProfiledFunctions.push_back(&Function); 1386 if (Function.hasInferredProfile()) { 1387 ++NumInferredFunctions; 1388 InferredSampleCount += SampleCount; 1389 ++NumAllStaleFunctions; 1390 } 1391 } else { 1392 if (opts::ReportStaleFuncs) { 1393 outs() << StaleFuncsHeader; 1394 StaleFuncsHeader = ""; 1395 outs() << " " << Function << '\n'; 1396 } 1397 ++NumStaleProfileFunctions; 1398 StaleSampleCount += SampleCount; 1399 ++NumAllStaleFunctions; 1400 } 1401 } 1402 BC.NumProfiledFuncs = ProfiledFunctions.size(); 1403 BC.NumStaleProfileFuncs = NumStaleProfileFunctions; 1404 1405 const size_t NumAllProfiledFunctions = 1406 ProfiledFunctions.size() + NumStaleProfileFunctions; 1407 outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " 1408 << NumRegularFunctions << " functions in the binary (" 1409 << format("%.1f", NumAllProfiledFunctions / 1410 (float)NumRegularFunctions * 100.0f) 1411 << "%) have non-empty execution profile\n"; 1412 if (NumNonSimpleProfiledFunctions) { 1413 outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" 1414 << (NumNonSimpleProfiledFunctions == 1 ? "" : "s") 1415 << " with profile could not be optimized\n"; 1416 } 1417 if (NumAllStaleFunctions) { 1418 const float PctStale = 1419 NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f; 1420 const float PctStaleFuncsWithEqualBlockCount = 1421 (float)BC.Stats.NumStaleFuncsWithEqualBlockCount / 1422 NumAllStaleFunctions * 100.0f; 1423 const float PctStaleBlocksWithEqualIcount = 1424 (float)BC.Stats.NumStaleBlocksWithEqualIcount / 1425 BC.Stats.NumStaleBlocks * 100.0f; 1426 auto printErrorOrWarning = [&]() { 1427 if (PctStale > opts::StaleThreshold) 1428 errs() << "BOLT-ERROR: "; 1429 else 1430 errs() << "BOLT-WARNING: "; 1431 }; 1432 printErrorOrWarning(); 1433 errs() << NumAllStaleFunctions 1434 << format(" (%.1f%% of all profiled)", PctStale) << " function" 1435 << (NumAllStaleFunctions == 1 ? "" : "s") 1436 << " have invalid (possibly stale) profile." 1437 " Use -report-stale to see the list.\n"; 1438 if (TotalSampleCount > 0) { 1439 printErrorOrWarning(); 1440 errs() << (StaleSampleCount + InferredSampleCount) << " out of " 1441 << TotalSampleCount << " samples in the binary (" 1442 << format("%.1f", 1443 ((100.0f * (StaleSampleCount + InferredSampleCount)) / 1444 TotalSampleCount)) 1445 << "%) belong to functions with invalid" 1446 " (possibly stale) profile.\n"; 1447 } 1448 outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount 1449 << " stale function" 1450 << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s") 1451 << format(" (%.1f%% of all stale)", PctStaleFuncsWithEqualBlockCount) 1452 << " have matching block count.\n"; 1453 outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount 1454 << " stale block" 1455 << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s") 1456 << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount) 1457 << " have matching icount.\n"; 1458 if (PctStale > opts::StaleThreshold) { 1459 return createFatalBOLTError( 1460 Twine("BOLT-ERROR: stale functions exceed specified threshold of ") + 1461 Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n")); 1462 } 1463 } 1464 if (NumInferredFunctions) { 1465 outs() << format("BOLT-INFO: inferred profile for %d (%.2f%% of profiled, " 1466 "%.2f%% of stale) functions responsible for %.2f%% samples" 1467 " (%zu out of %zu)\n", 1468 NumInferredFunctions, 1469 100.0 * NumInferredFunctions / NumAllProfiledFunctions, 1470 100.0 * NumInferredFunctions / NumAllStaleFunctions, 1471 100.0 * InferredSampleCount / TotalSampleCount, 1472 InferredSampleCount, TotalSampleCount); 1473 outs() << format( 1474 "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" 1475 " (%zu out of %zu stale) responsible for %.2f%% samples" 1476 " (%zu out of %zu stale)\n", 1477 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, 1478 BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, 1479 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, 1480 BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); 1481 } 1482 1483 if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { 1484 outs() << "BOLT-INFO: profile for " << NumUnusedObjects 1485 << " objects was ignored\n"; 1486 } 1487 1488 if (ProfiledFunctions.size() > 10) { 1489 if (opts::Verbosity >= 1) { 1490 outs() << "BOLT-INFO: top called functions are:\n"; 1491 llvm::sort(ProfiledFunctions, 1492 [](const BinaryFunction *A, const BinaryFunction *B) { 1493 return B->getExecutionCount() < A->getExecutionCount(); 1494 }); 1495 auto SFI = ProfiledFunctions.begin(); 1496 auto SFIend = ProfiledFunctions.end(); 1497 for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; 1498 ++SFI, ++I) 1499 outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n'; 1500 } 1501 } 1502 1503 if (!opts::PrintSortedBy.empty()) { 1504 std::vector<BinaryFunction *> Functions; 1505 std::map<const BinaryFunction *, DynoStats> Stats; 1506 1507 for (auto &BFI : BC.getBinaryFunctions()) { 1508 BinaryFunction &BF = BFI.second; 1509 if (shouldOptimize(BF) && BF.hasValidProfile()) { 1510 Functions.push_back(&BF); 1511 Stats.emplace(&BF, getDynoStats(BF)); 1512 } 1513 } 1514 1515 const bool SortAll = 1516 llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT); 1517 1518 const bool Ascending = 1519 opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; 1520 1521 if (SortAll) { 1522 llvm::stable_sort(Functions, 1523 [Ascending, &Stats](const BinaryFunction *A, 1524 const BinaryFunction *B) { 1525 return Ascending ? Stats.at(A) < Stats.at(B) 1526 : Stats.at(B) < Stats.at(A); 1527 }); 1528 } else { 1529 llvm::stable_sort( 1530 Functions, [Ascending, &Stats](const BinaryFunction *A, 1531 const BinaryFunction *B) { 1532 const DynoStats &StatsA = Stats.at(A); 1533 const DynoStats &StatsB = Stats.at(B); 1534 return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy) 1535 : StatsB.lessThan(StatsA, opts::PrintSortedBy); 1536 }); 1537 } 1538 1539 outs() << "BOLT-INFO: top functions sorted by "; 1540 if (SortAll) { 1541 outs() << "dyno stats"; 1542 } else { 1543 outs() << "("; 1544 bool PrintComma = false; 1545 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1546 if (PrintComma) 1547 outs() << ", "; 1548 outs() << DynoStats::Description(Category); 1549 PrintComma = true; 1550 } 1551 outs() << ")"; 1552 } 1553 1554 outs() << " are:\n"; 1555 auto SFI = Functions.begin(); 1556 for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { 1557 const DynoStats Stats = getDynoStats(**SFI); 1558 outs() << " " << **SFI; 1559 if (!SortAll) { 1560 outs() << " ("; 1561 bool PrintComma = false; 1562 for (const DynoStats::Category Category : opts::PrintSortedBy) { 1563 if (PrintComma) 1564 outs() << ", "; 1565 outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; 1566 PrintComma = true; 1567 } 1568 outs() << ")"; 1569 } 1570 outs() << "\n"; 1571 } 1572 } 1573 1574 if (!BC.TrappedFunctions.empty()) { 1575 errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" 1576 << (BC.TrappedFunctions.size() > 1 ? "s" : "") 1577 << " will trap on entry. Use -trap-avx512=0 to disable" 1578 " traps."; 1579 if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { 1580 errs() << '\n'; 1581 for (const BinaryFunction *Function : BC.TrappedFunctions) 1582 errs() << " " << *Function << '\n'; 1583 } else { 1584 errs() << " Use -v=1 to see the list.\n"; 1585 } 1586 } 1587 1588 // Print information on missed macro-fusion opportunities seen on input. 1589 if (BC.Stats.MissedMacroFusionPairs) { 1590 outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)" 1591 " opportunities for macro-fusion optimization", 1592 BC.Stats.MissedMacroFusionPairs, 1593 BC.Stats.MissedMacroFusionExecCount); 1594 switch (opts::AlignMacroOpFusion) { 1595 case MFT_NONE: 1596 outs() << ". Use -align-macro-fusion to fix.\n"; 1597 break; 1598 case MFT_HOT: 1599 outs() << ". Will fix instances on a hot path.\n"; 1600 break; 1601 case MFT_ALL: 1602 outs() << " that are going to be fixed\n"; 1603 break; 1604 } 1605 } 1606 1607 // Collect and print information about suboptimal code layout on input. 1608 if (opts::ReportBadLayout) { 1609 std::vector<BinaryFunction *> SuboptimalFuncs; 1610 for (auto &BFI : BC.getBinaryFunctions()) { 1611 BinaryFunction &BF = BFI.second; 1612 if (!BF.hasValidProfile()) 1613 continue; 1614 1615 const uint64_t HotThreshold = 1616 std::max<uint64_t>(BF.getKnownExecutionCount(), 1); 1617 bool HotSeen = false; 1618 for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) { 1619 if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { 1620 HotSeen = true; 1621 continue; 1622 } 1623 if (HotSeen && BB->getKnownExecutionCount() == 0) { 1624 SuboptimalFuncs.push_back(&BF); 1625 break; 1626 } 1627 } 1628 } 1629 1630 if (!SuboptimalFuncs.empty()) { 1631 llvm::sort(SuboptimalFuncs, 1632 [](const BinaryFunction *A, const BinaryFunction *B) { 1633 return A->getKnownExecutionCount() / A->getSize() > 1634 B->getKnownExecutionCount() / B->getSize(); 1635 }); 1636 1637 outs() << "BOLT-INFO: " << SuboptimalFuncs.size() 1638 << " functions have " 1639 "cold code in the middle of hot code. Top functions are:\n"; 1640 for (unsigned I = 0; 1641 I < std::min(static_cast<size_t>(opts::ReportBadLayout), 1642 SuboptimalFuncs.size()); 1643 ++I) 1644 SuboptimalFuncs[I]->print(outs()); 1645 } 1646 } 1647 1648 if (NumUnknownControlFlowFunctions) { 1649 outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions 1650 << " functions have instructions with unknown control flow"; 1651 if (!opts::PrintUnknown) 1652 outs() << ". Use -print-unknown to see the list."; 1653 outs() << '\n'; 1654 } 1655 return Error::success(); 1656 } 1657 1658 Error InstructionLowering::runOnFunctions(BinaryContext &BC) { 1659 for (auto &BFI : BC.getBinaryFunctions()) 1660 for (BinaryBasicBlock &BB : BFI.second) 1661 for (MCInst &Instruction : BB) 1662 BC.MIB->lowerTailCall(Instruction); 1663 return Error::success(); 1664 } 1665 1666 Error StripRepRet::runOnFunctions(BinaryContext &BC) { 1667 if (!BC.isX86()) 1668 return Error::success(); 1669 1670 uint64_t NumPrefixesRemoved = 0; 1671 uint64_t NumBytesSaved = 0; 1672 for (auto &BFI : BC.getBinaryFunctions()) { 1673 for (BinaryBasicBlock &BB : BFI.second) { 1674 auto LastInstRIter = BB.getLastNonPseudo(); 1675 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) || 1676 !BC.MIB->deleteREPPrefix(*LastInstRIter)) 1677 continue; 1678 1679 NumPrefixesRemoved += BB.getKnownExecutionCount(); 1680 ++NumBytesSaved; 1681 } 1682 } 1683 1684 if (NumBytesSaved) 1685 outs() << "BOLT-INFO: removed " << NumBytesSaved 1686 << " 'repz' prefixes" 1687 " with estimated execution count of " 1688 << NumPrefixesRemoved << " times.\n"; 1689 return Error::success(); 1690 } 1691 1692 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { 1693 if (!BC.isX86()) 1694 return Error::success(); 1695 1696 uint64_t NumInlined = 0; 1697 uint64_t NumInlinedDyno = 0; 1698 for (auto &BFI : BC.getBinaryFunctions()) { 1699 for (BinaryBasicBlock &BB : BFI.second) { 1700 for (auto II = BB.begin(); II != BB.end(); ++II) { 1701 MCInst &Inst = *II; 1702 1703 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1704 !Inst.getOperand(0).isExpr()) 1705 continue; 1706 1707 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1708 if (CalleeSymbol->getName() != "memcpy" && 1709 CalleeSymbol->getName() != "memcpy@PLT" && 1710 CalleeSymbol->getName() != "_memcpy8") 1711 continue; 1712 1713 const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); 1714 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1715 1716 const InstructionListType NewCode = 1717 BC.MIB->createInlineMemcpy(IsMemcpy8); 1718 II = BB.replaceInstruction(II, NewCode); 1719 std::advance(II, NewCode.size() - 1); 1720 if (IsTailCall) { 1721 MCInst Return; 1722 BC.MIB->createReturn(Return); 1723 II = BB.insertInstruction(std::next(II), std::move(Return)); 1724 } 1725 1726 ++NumInlined; 1727 NumInlinedDyno += BB.getKnownExecutionCount(); 1728 } 1729 } 1730 } 1731 1732 if (NumInlined) { 1733 outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; 1734 if (NumInlinedDyno) 1735 outs() << ". The calls were executed " << NumInlinedDyno 1736 << " times based on profile."; 1737 outs() << '\n'; 1738 } 1739 return Error::success(); 1740 } 1741 1742 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { 1743 if (!BinaryFunctionPass::shouldOptimize(Function)) 1744 return false; 1745 1746 for (const std::string &FunctionSpec : Spec) { 1747 StringRef FunctionName = StringRef(FunctionSpec).split(':').first; 1748 if (Function.hasNameRegex(FunctionName)) 1749 return true; 1750 } 1751 1752 return false; 1753 } 1754 1755 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( 1756 const BinaryFunction &Function) const { 1757 StringRef SitesString; 1758 for (const std::string &FunctionSpec : Spec) { 1759 StringRef FunctionName; 1760 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':'); 1761 if (Function.hasNameRegex(FunctionName)) 1762 break; 1763 SitesString = ""; 1764 } 1765 1766 std::set<size_t> Sites; 1767 SmallVector<StringRef, 4> SitesVec; 1768 SitesString.split(SitesVec, ':'); 1769 for (StringRef SiteString : SitesVec) { 1770 if (SiteString.empty()) 1771 continue; 1772 size_t Result; 1773 if (!SiteString.getAsInteger(10, Result)) 1774 Sites.emplace(Result); 1775 } 1776 1777 return Sites; 1778 } 1779 1780 Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { 1781 if (!BC.isX86()) 1782 return Error::success(); 1783 1784 uint64_t NumSpecialized = 0; 1785 uint64_t NumSpecializedDyno = 0; 1786 for (auto &BFI : BC.getBinaryFunctions()) { 1787 BinaryFunction &Function = BFI.second; 1788 if (!shouldOptimize(Function)) 1789 continue; 1790 1791 std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function); 1792 auto shouldOptimize = [&](size_t N) { 1793 return CallsToOptimize.empty() || CallsToOptimize.count(N); 1794 }; 1795 1796 std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); 1797 size_t CallSiteID = 0; 1798 for (BinaryBasicBlock *CurBB : Blocks) { 1799 for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { 1800 MCInst &Inst = *II; 1801 1802 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || 1803 !Inst.getOperand(0).isExpr()) 1804 continue; 1805 1806 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); 1807 if (CalleeSymbol->getName() != "memcpy" && 1808 CalleeSymbol->getName() != "memcpy@PLT") 1809 continue; 1810 1811 if (BC.MIB->isTailCall(Inst)) 1812 continue; 1813 1814 ++CallSiteID; 1815 1816 if (!shouldOptimize(CallSiteID)) 1817 continue; 1818 1819 // Create a copy of a call to memcpy(dest, src, size). 1820 MCInst MemcpyInstr = Inst; 1821 1822 BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II); 1823 1824 BinaryBasicBlock *NextBB = nullptr; 1825 if (OneByteMemcpyBB->getNumNonPseudos() > 1) { 1826 NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin()); 1827 NextBB->eraseInstruction(NextBB->begin()); 1828 } else { 1829 NextBB = OneByteMemcpyBB->getSuccessor(); 1830 OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin()); 1831 assert(NextBB && "unexpected call to memcpy() with no return"); 1832 } 1833 1834 BinaryBasicBlock *MemcpyBB = Function.addBasicBlock(); 1835 MemcpyBB->setOffset(CurBB->getInputOffset()); 1836 InstructionListType CmpJCC = 1837 BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1, 1838 OneByteMemcpyBB->getLabel(), BC.Ctx.get()); 1839 CurBB->addInstructions(CmpJCC); 1840 CurBB->addSuccessor(MemcpyBB); 1841 1842 MemcpyBB->addInstruction(std::move(MemcpyInstr)); 1843 MemcpyBB->addSuccessor(NextBB); 1844 MemcpyBB->setCFIState(NextBB->getCFIState()); 1845 MemcpyBB->setExecutionCount(0); 1846 1847 // To prevent the actual call from being moved to cold, we set its 1848 // execution count to 1. 1849 if (CurBB->getKnownExecutionCount() > 0) 1850 MemcpyBB->setExecutionCount(1); 1851 1852 InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy(); 1853 OneByteMemcpyBB->addInstructions(OneByteMemcpy); 1854 1855 ++NumSpecialized; 1856 NumSpecializedDyno += CurBB->getKnownExecutionCount(); 1857 1858 CurBB = NextBB; 1859 1860 // Note: we don't expect the next instruction to be a call to memcpy. 1861 II = CurBB->begin(); 1862 } 1863 } 1864 } 1865 1866 if (NumSpecialized) { 1867 outs() << "BOLT-INFO: specialized " << NumSpecialized 1868 << " memcpy() call sites for size 1"; 1869 if (NumSpecializedDyno) 1870 outs() << ". The calls were executed " << NumSpecializedDyno 1871 << " times based on profile."; 1872 outs() << '\n'; 1873 } 1874 return Error::success(); 1875 } 1876 1877 void RemoveNops::runOnFunction(BinaryFunction &BF) { 1878 const BinaryContext &BC = BF.getBinaryContext(); 1879 for (BinaryBasicBlock &BB : BF) { 1880 for (int64_t I = BB.size() - 1; I >= 0; --I) { 1881 MCInst &Inst = BB.getInstructionAtIndex(I); 1882 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP")) 1883 BB.eraseInstructionAtIndex(I); 1884 } 1885 } 1886 } 1887 1888 Error RemoveNops::runOnFunctions(BinaryContext &BC) { 1889 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { 1890 runOnFunction(BF); 1891 }; 1892 1893 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { 1894 return BF.shouldPreserveNops(); 1895 }; 1896 1897 ParallelUtilities::runOnEachFunction( 1898 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, 1899 SkipFunc, "RemoveNops"); 1900 return Error::success(); 1901 } 1902 1903 } // namespace bolt 1904 } // namespace llvm 1905