1 //===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "ProfileGenerator.h" 9 #include "ErrorHandling.h" 10 #include "MissingFrameInferrer.h" 11 #include "PerfReader.h" 12 #include "ProfiledBinary.h" 13 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" 14 #include "llvm/ProfileData/ProfileCommon.h" 15 #include <algorithm> 16 #include <float.h> 17 #include <unordered_set> 18 #include <utility> 19 20 cl::opt<std::string> OutputFilename("output", cl::value_desc("output"), 21 cl::Required, 22 cl::desc("Output profile file")); 23 static cl::alias OutputA("o", cl::desc("Alias for --output"), 24 cl::aliasopt(OutputFilename)); 25 26 static cl::opt<SampleProfileFormat> OutputFormat( 27 "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), 28 cl::values( 29 clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"), 30 clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), 31 clEnumValN(SPF_Text, "text", "Text encoding"), 32 clEnumValN(SPF_GCC, "gcc", 33 "GCC encoding (only meaningful for -sample)"))); 34 35 static cl::opt<bool> UseMD5( 36 "use-md5", cl::Hidden, 37 cl::desc("Use md5 to represent function names in the output profile (only " 38 "meaningful for -extbinary)")); 39 40 static cl::opt<bool> PopulateProfileSymbolList( 41 "populate-profile-symbol-list", cl::init(false), cl::Hidden, 42 cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); 43 44 static cl::opt<bool> FillZeroForAllFuncs( 45 "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, 46 cl::desc("Attribute all functions' range with zero count " 47 "even it's not hit by any samples.")); 48 49 static cl::opt<int32_t, true> RecursionCompression( 50 "compress-recursion", 51 cl::desc("Compressing recursion by deduplicating adjacent frame " 52 "sequences up to the specified size. -1 means no size limit."), 53 cl::Hidden, 54 cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); 55 56 static cl::opt<bool> 57 TrimColdProfile("trim-cold-profile", 58 cl::desc("If the total count of the profile is smaller " 59 "than threshold, it will be trimmed.")); 60 61 static cl::opt<bool> CSProfMergeColdContext( 62 "csprof-merge-cold-context", cl::init(true), 63 cl::desc("If the total count of context profile is smaller than " 64 "the threshold, it will be merged into context-less base " 65 "profile.")); 66 67 static cl::opt<uint32_t> CSProfMaxColdContextDepth( 68 "csprof-max-cold-context-depth", cl::init(1), 69 cl::desc("Keep the last K contexts while merging cold profile. 1 means the " 70 "context-less base profile")); 71 72 static cl::opt<int, true> CSProfMaxContextDepth( 73 "csprof-max-context-depth", 74 cl::desc("Keep the last K contexts while merging profile. -1 means no " 75 "depth limit."), 76 cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); 77 78 static cl::opt<double> ProfileDensityThreshold( 79 "profile-density-threshold", llvm::cl::init(50), 80 llvm::cl::desc("If the profile density is below the given threshold, it " 81 "will be suggested to increase the sampling rate."), 82 llvm::cl::Optional); 83 static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false), 84 llvm::cl::desc("show profile density details"), 85 llvm::cl::Optional); 86 static cl::opt<int> ProfileDensityCutOffHot( 87 "profile-density-cutoff-hot", llvm::cl::init(990000), 88 llvm::cl::desc("Total samples cutoff for functions used to calculate " 89 "profile density.")); 90 91 static cl::opt<bool> UpdateTotalSamples( 92 "update-total-samples", llvm::cl::init(false), 93 llvm::cl::desc( 94 "Update total samples by accumulating all its body samples."), 95 llvm::cl::Optional); 96 97 static cl::opt<bool> GenCSNestedProfile( 98 "gen-cs-nested-profile", cl::Hidden, cl::init(true), 99 cl::desc("Generate nested function profiles for CSSPGO")); 100 101 cl::opt<bool> InferMissingFrames( 102 "infer-missing-frames", llvm::cl::init(true), 103 llvm::cl::desc( 104 "Infer missing call frames due to compiler tail call elimination."), 105 llvm::cl::Optional); 106 107 using namespace llvm; 108 using namespace sampleprof; 109 110 namespace llvm { 111 extern cl::opt<int> ProfileSummaryCutoffHot; 112 extern cl::opt<bool> UseContextLessSummary; 113 114 namespace sampleprof { 115 116 // Initialize the MaxCompressionSize to -1 which means no size limit 117 int32_t CSProfileGenerator::MaxCompressionSize = -1; 118 119 int CSProfileGenerator::MaxContextDepth = -1; 120 121 bool ProfileGeneratorBase::UseFSDiscriminator = false; 122 123 std::unique_ptr<ProfileGeneratorBase> 124 ProfileGeneratorBase::create(ProfiledBinary *Binary, 125 const ContextSampleCounterMap *SampleCounters, 126 bool ProfileIsCS) { 127 std::unique_ptr<ProfileGeneratorBase> Generator; 128 if (ProfileIsCS) { 129 Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); 130 } else { 131 Generator.reset(new ProfileGenerator(Binary, SampleCounters)); 132 } 133 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); 134 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); 135 136 return Generator; 137 } 138 139 std::unique_ptr<ProfileGeneratorBase> 140 ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles, 141 bool ProfileIsCS) { 142 std::unique_ptr<ProfileGeneratorBase> Generator; 143 if (ProfileIsCS) { 144 Generator.reset(new CSProfileGenerator(Binary, Profiles)); 145 } else { 146 Generator.reset(new ProfileGenerator(Binary, std::move(Profiles))); 147 } 148 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); 149 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); 150 151 return Generator; 152 } 153 154 void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer, 155 SampleProfileMap &ProfileMap) { 156 // Populate profile symbol list if extended binary format is used. 157 ProfileSymbolList SymbolList; 158 159 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { 160 Binary->populateSymbolListFromDWARF(SymbolList); 161 Writer->setProfileSymbolList(&SymbolList); 162 } 163 164 if (std::error_code EC = Writer->write(ProfileMap)) 165 exitWithError(std::move(EC)); 166 } 167 168 void ProfileGeneratorBase::write() { 169 auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); 170 if (std::error_code EC = WriterOrErr.getError()) 171 exitWithError(EC, OutputFilename); 172 173 if (UseMD5) { 174 if (OutputFormat != SPF_Ext_Binary) 175 WithColor::warning() << "-use-md5 is ignored. Specify " 176 "--format=extbinary to enable it\n"; 177 else 178 WriterOrErr.get()->setUseMD5(); 179 } 180 181 write(std::move(WriterOrErr.get()), ProfileMap); 182 } 183 184 void ProfileGeneratorBase::showDensitySuggestion(double Density) { 185 if (Density == 0.0) 186 WithColor::warning() << "The output profile is empty or the " 187 "--profile-density-cutoff-hot option is " 188 "set too low. Please check your command.\n"; 189 else if (Density < ProfileDensityThreshold) 190 WithColor::warning() 191 << "Sample PGO is estimated to optimize better with " 192 << format("%.1f", ProfileDensityThreshold / Density) 193 << "x more samples. Please consider increasing sampling rate or " 194 "profiling for longer duration to get more samples.\n"; 195 196 if (ShowDensity) 197 outs() << "Functions with density >= " << format("%.1f", Density) 198 << " account for " 199 << format("%.2f", 200 static_cast<double>(ProfileDensityCutOffHot) / 10000) 201 << "% total sample counts.\n"; 202 } 203 204 bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) { 205 for (const auto &Prefix : FuncPrefixsToFilter) { 206 if (FS.getFuncName().starts_with(Prefix)) 207 return true; 208 } 209 210 // Filter the function profiles for the inlinees. It's useful for fuzzy 211 // profile matching which flattens the profile and inlinees' samples are 212 // merged into top-level function. 213 for (auto &Callees : 214 const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) { 215 auto &CalleesMap = Callees.second; 216 for (auto I = CalleesMap.begin(); I != CalleesMap.end();) { 217 auto FS = I++; 218 if (filterAmbiguousProfile(FS->second)) 219 CalleesMap.erase(FS); 220 } 221 } 222 return false; 223 } 224 225 // For built-in local initialization function such as __cxx_global_var_init, 226 // __tls_init prefix function, there could be multiple versions of the functions 227 // in the final binary. However, in the profile generation, we call 228 // getCanonicalFnName to canonicalize the names which strips the suffixes. 229 // Therefore, samples from different functions queries the same profile and the 230 // samples are merged. As the functions are essentially different, entries of 231 // the merged profile are ambiguous. In sample loader, the IR from one version 232 // would be attributed towards a merged entries, which is inaccurate. Especially 233 // for fuzzy profile matching, it gets multiple callsites(from different 234 // function) but used to match one callsite, which misleads the matching and 235 // causes a lot of false positives report. Hence, we want to filter them out 236 // from the profile map during the profile generation time. The profiles are all 237 // cold functions, it won't have perf impact. 238 void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) { 239 for (auto I = ProfileMap.begin(); I != ProfileMap.end();) { 240 auto FS = I++; 241 if (filterAmbiguousProfile(FS->second)) 242 ProfileMap.erase(FS); 243 } 244 } 245 246 void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, 247 const RangeSample &Ranges) { 248 249 /* 250 Regions may overlap with each other. Using the boundary info, find all 251 disjoint ranges and their sample count. BoundaryPoint contains the count 252 multiple samples begin/end at this points. 253 254 |<--100-->| Sample1 255 |<------200------>| Sample2 256 A B C 257 258 In the example above, 259 Sample1 begins at A, ends at B, its value is 100. 260 Sample2 beings at A, ends at C, its value is 200. 261 For A, BeginCount is the sum of sample begins at A, which is 300 and no 262 samples ends at A, so EndCount is 0. 263 Then boundary points A, B, and C with begin/end counts are: 264 A: (300, 0) 265 B: (0, 100) 266 C: (0, 200) 267 */ 268 struct BoundaryPoint { 269 // Sum of sample counts beginning at this point 270 uint64_t BeginCount = UINT64_MAX; 271 // Sum of sample counts ending at this point 272 uint64_t EndCount = UINT64_MAX; 273 // Is the begin point of a zero range. 274 bool IsZeroRangeBegin = false; 275 // Is the end point of a zero range. 276 bool IsZeroRangeEnd = false; 277 278 void addBeginCount(uint64_t Count) { 279 if (BeginCount == UINT64_MAX) 280 BeginCount = 0; 281 BeginCount += Count; 282 } 283 284 void addEndCount(uint64_t Count) { 285 if (EndCount == UINT64_MAX) 286 EndCount = 0; 287 EndCount += Count; 288 } 289 }; 290 291 /* 292 For the above example. With boundary points, follwing logic finds two 293 disjoint region of 294 295 [A,B]: 300 296 [B+1,C]: 200 297 298 If there is a boundary point that both begin and end, the point itself 299 becomes a separate disjoint region. For example, if we have original 300 ranges of 301 302 |<--- 100 --->| 303 |<--- 200 --->| 304 A B C 305 306 there are three boundary points with their begin/end counts of 307 308 A: (100, 0) 309 B: (200, 100) 310 C: (0, 200) 311 312 the disjoint ranges would be 313 314 [A, B-1]: 100 315 [B, B]: 300 316 [B+1, C]: 200. 317 318 Example for zero value range: 319 320 |<--- 100 --->| 321 |<--- 200 --->| 322 |<--------------- 0 ----------------->| 323 A B C D E F 324 325 [A, B-1] : 0 326 [B, C] : 100 327 [C+1, D-1]: 0 328 [D, E] : 200 329 [E+1, F] : 0 330 */ 331 std::map<uint64_t, BoundaryPoint> Boundaries; 332 333 for (const auto &Item : Ranges) { 334 assert(Item.first.first <= Item.first.second && 335 "Invalid instruction range"); 336 auto &BeginPoint = Boundaries[Item.first.first]; 337 auto &EndPoint = Boundaries[Item.first.second]; 338 uint64_t Count = Item.second; 339 340 BeginPoint.addBeginCount(Count); 341 EndPoint.addEndCount(Count); 342 if (Count == 0) { 343 BeginPoint.IsZeroRangeBegin = true; 344 EndPoint.IsZeroRangeEnd = true; 345 } 346 } 347 348 // Use UINT64_MAX to indicate there is no existing range between BeginAddress 349 // and the next valid address 350 uint64_t BeginAddress = UINT64_MAX; 351 int ZeroRangeDepth = 0; 352 uint64_t Count = 0; 353 for (const auto &Item : Boundaries) { 354 uint64_t Address = Item.first; 355 const BoundaryPoint &Point = Item.second; 356 if (Point.BeginCount != UINT64_MAX) { 357 if (BeginAddress != UINT64_MAX) 358 DisjointRanges[{BeginAddress, Address - 1}] = Count; 359 Count += Point.BeginCount; 360 BeginAddress = Address; 361 ZeroRangeDepth += Point.IsZeroRangeBegin; 362 } 363 if (Point.EndCount != UINT64_MAX) { 364 assert((BeginAddress != UINT64_MAX) && 365 "First boundary point cannot be 'end' point"); 366 DisjointRanges[{BeginAddress, Address}] = Count; 367 assert(Count >= Point.EndCount && "Mismatched live ranges"); 368 Count -= Point.EndCount; 369 BeginAddress = Address + 1; 370 ZeroRangeDepth -= Point.IsZeroRangeEnd; 371 // If the remaining count is zero and it's no longer in a zero range, this 372 // means we consume all the ranges before, thus mark BeginAddress as 373 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges: 374 // [<---- 10 ---->] 375 // [<---- 20 ---->] 376 // A B C D 377 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't 378 // have the [B+1, C-1] zero range. 379 if (Count == 0 && ZeroRangeDepth == 0) 380 BeginAddress = UINT64_MAX; 381 } 382 } 383 } 384 385 void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( 386 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, 387 uint64_t Count) { 388 // Use the maximum count of samples with same line location 389 uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator); 390 391 // Use duplication factor to compensated for loop unroll/vectorization. 392 // Note that this is only needed when we're taking MAX of the counts at 393 // the location instead of SUM. 394 Count *= getDuplicationFactor(LeafLoc.Location.Discriminator); 395 396 ErrorOr<uint64_t> R = 397 FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator); 398 399 uint64_t PreviousCount = R ? R.get() : 0; 400 if (PreviousCount <= Count) { 401 FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, 402 Count - PreviousCount); 403 } 404 } 405 406 void ProfileGeneratorBase::updateTotalSamples() { 407 for (auto &Item : ProfileMap) { 408 FunctionSamples &FunctionProfile = Item.second; 409 FunctionProfile.updateTotalSamples(); 410 } 411 } 412 413 void ProfileGeneratorBase::updateCallsiteSamples() { 414 for (auto &Item : ProfileMap) { 415 FunctionSamples &FunctionProfile = Item.second; 416 FunctionProfile.updateCallsiteSamples(); 417 } 418 } 419 420 void ProfileGeneratorBase::updateFunctionSamples() { 421 updateCallsiteSamples(); 422 423 if (UpdateTotalSamples) 424 updateTotalSamples(); 425 } 426 427 void ProfileGeneratorBase::collectProfiledFunctions() { 428 std::unordered_set<const BinaryFunction *> ProfiledFunctions; 429 if (collectFunctionsFromRawProfile(ProfiledFunctions)) 430 Binary->setProfiledFunctions(ProfiledFunctions); 431 else if (collectFunctionsFromLLVMProfile(ProfiledFunctions)) 432 Binary->setProfiledFunctions(ProfiledFunctions); 433 else 434 llvm_unreachable("Unsupported input profile"); 435 } 436 437 bool ProfileGeneratorBase::collectFunctionsFromRawProfile( 438 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) { 439 if (!SampleCounters) 440 return false; 441 // Go through all the stacks, ranges and branches in sample counters, use 442 // the start of the range to look up the function it belongs and record the 443 // function. 444 for (const auto &CI : *SampleCounters) { 445 if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(CI.first.getPtr())) { 446 for (auto StackAddr : CtxKey->Context) { 447 if (FuncRange *FRange = Binary->findFuncRange(StackAddr)) 448 ProfiledFunctions.insert(FRange->Func); 449 } 450 } 451 452 for (auto Item : CI.second.RangeCounter) { 453 uint64_t StartAddress = Item.first.first; 454 if (FuncRange *FRange = Binary->findFuncRange(StartAddress)) 455 ProfiledFunctions.insert(FRange->Func); 456 } 457 458 for (auto Item : CI.second.BranchCounter) { 459 uint64_t SourceAddress = Item.first.first; 460 uint64_t TargetAddress = Item.first.second; 461 if (FuncRange *FRange = Binary->findFuncRange(SourceAddress)) 462 ProfiledFunctions.insert(FRange->Func); 463 if (FuncRange *FRange = Binary->findFuncRange(TargetAddress)) 464 ProfiledFunctions.insert(FRange->Func); 465 } 466 } 467 return true; 468 } 469 470 bool ProfileGenerator::collectFunctionsFromLLVMProfile( 471 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) { 472 for (const auto &FS : ProfileMap) { 473 if (auto *Func = Binary->getBinaryFunction(FS.second.getFunction())) 474 ProfiledFunctions.insert(Func); 475 } 476 return true; 477 } 478 479 bool CSProfileGenerator::collectFunctionsFromLLVMProfile( 480 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) { 481 for (auto *Node : ContextTracker) { 482 if (!Node->getFuncName().empty()) 483 if (auto *Func = Binary->getBinaryFunction(Node->getFuncName())) 484 ProfiledFunctions.insert(Func); 485 } 486 return true; 487 } 488 489 FunctionSamples & 490 ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) { 491 SampleContext Context(FuncName); 492 return ProfileMap.create(Context); 493 } 494 495 void ProfileGenerator::generateProfile() { 496 collectProfiledFunctions(); 497 498 if (Binary->usePseudoProbes()) 499 Binary->decodePseudoProbe(); 500 501 if (SampleCounters) { 502 if (Binary->usePseudoProbes()) { 503 generateProbeBasedProfile(); 504 } else { 505 generateLineNumBasedProfile(); 506 } 507 } 508 509 postProcessProfiles(); 510 } 511 512 void ProfileGenerator::postProcessProfiles() { 513 computeSummaryAndThreshold(ProfileMap); 514 trimColdProfiles(ProfileMap, ColdCountThreshold); 515 filterAmbiguousProfile(ProfileMap); 516 calculateAndShowDensity(ProfileMap); 517 } 518 519 void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, 520 uint64_t ColdCntThreshold) { 521 if (!TrimColdProfile) 522 return; 523 524 // Move cold profiles into a tmp container. 525 std::vector<hash_code> ColdProfileHashes; 526 for (const auto &I : ProfileMap) { 527 if (I.second.getTotalSamples() < ColdCntThreshold) 528 ColdProfileHashes.emplace_back(I.first); 529 } 530 531 // Remove the cold profile from ProfileMap. 532 for (const auto &I : ColdProfileHashes) 533 ProfileMap.erase(I); 534 } 535 536 void ProfileGenerator::generateLineNumBasedProfile() { 537 assert(SampleCounters->size() == 1 && 538 "Must have one entry for profile generation."); 539 const SampleCounter &SC = SampleCounters->begin()->second; 540 // Fill in function body samples 541 populateBodySamplesForAllFunctions(SC.RangeCounter); 542 // Fill in boundary sample counts as well as call site samples for calls 543 populateBoundarySamplesForAllFunctions(SC.BranchCounter); 544 545 updateFunctionSamples(); 546 } 547 548 void ProfileGenerator::generateProbeBasedProfile() { 549 assert(SampleCounters->size() == 1 && 550 "Must have one entry for profile generation."); 551 // Enable pseudo probe functionalities in SampleProf 552 FunctionSamples::ProfileIsProbeBased = true; 553 const SampleCounter &SC = SampleCounters->begin()->second; 554 // Fill in function body samples 555 populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter); 556 // Fill in boundary sample counts as well as call site samples for calls 557 populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter); 558 559 updateFunctionSamples(); 560 } 561 562 void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions( 563 const RangeSample &RangeCounter) { 564 ProbeCounterMap ProbeCounter; 565 // preprocessRangeCounter returns disjoint ranges, so no longer to redo it 566 // inside extractProbesFromRange. 567 extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter, 568 false); 569 570 for (const auto &PI : ProbeCounter) { 571 const MCDecodedPseudoProbe *Probe = PI.first; 572 uint64_t Count = PI.second; 573 SampleContextFrameVector FrameVec; 574 Binary->getInlineContextForProbe(Probe, FrameVec, true); 575 FunctionSamples &FunctionProfile = 576 getLeafProfileAndAddTotalSamples(FrameVec, Count); 577 FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(), 578 Count); 579 if (Probe->isEntry()) 580 FunctionProfile.addHeadSamples(Count); 581 } 582 } 583 584 void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( 585 const BranchSample &BranchCounters) { 586 for (const auto &Entry : BranchCounters) { 587 uint64_t SourceAddress = Entry.first.first; 588 uint64_t TargetAddress = Entry.first.second; 589 uint64_t Count = Entry.second; 590 assert(Count != 0 && "Unexpected zero weight branch"); 591 592 StringRef CalleeName = getCalleeNameForAddress(TargetAddress); 593 if (CalleeName.size() == 0) 594 continue; 595 596 const MCDecodedPseudoProbe *CallProbe = 597 Binary->getCallProbeForAddr(SourceAddress); 598 if (CallProbe == nullptr) 599 continue; 600 601 // Record called target sample and its count. 602 SampleContextFrameVector FrameVec; 603 Binary->getInlineContextForProbe(CallProbe, FrameVec, true); 604 605 if (!FrameVec.empty()) { 606 FunctionSamples &FunctionProfile = 607 getLeafProfileAndAddTotalSamples(FrameVec, 0); 608 FunctionProfile.addCalledTargetSamples( 609 FrameVec.back().Location.LineOffset, 610 FrameVec.back().Location.Discriminator, 611 FunctionId(CalleeName), Count); 612 } 613 } 614 } 615 616 FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( 617 const SampleContextFrameVector &FrameVec, uint64_t Count) { 618 // Get top level profile 619 FunctionSamples *FunctionProfile = 620 &getTopLevelFunctionProfile(FrameVec[0].Func); 621 FunctionProfile->addTotalSamples(Count); 622 if (Binary->usePseudoProbes()) { 623 const auto *FuncDesc = Binary->getFuncDescForGUID( 624 FunctionProfile->getFunction().getHashCode()); 625 FunctionProfile->setFunctionHash(FuncDesc->FuncHash); 626 } 627 628 for (size_t I = 1; I < FrameVec.size(); I++) { 629 LineLocation Callsite( 630 FrameVec[I - 1].Location.LineOffset, 631 getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); 632 FunctionSamplesMap &SamplesMap = 633 FunctionProfile->functionSamplesAt(Callsite); 634 auto Ret = 635 SamplesMap.emplace(FrameVec[I].Func, FunctionSamples()); 636 if (Ret.second) { 637 SampleContext Context(FrameVec[I].Func); 638 Ret.first->second.setContext(Context); 639 } 640 FunctionProfile = &Ret.first->second; 641 FunctionProfile->addTotalSamples(Count); 642 if (Binary->usePseudoProbes()) { 643 const auto *FuncDesc = Binary->getFuncDescForGUID( 644 FunctionProfile->getFunction().getHashCode()); 645 FunctionProfile->setFunctionHash(FuncDesc->FuncHash); 646 } 647 } 648 649 return *FunctionProfile; 650 } 651 652 RangeSample 653 ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { 654 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); 655 if (FillZeroForAllFuncs) { 656 for (auto &FuncI : Binary->getAllBinaryFunctions()) { 657 for (auto &R : FuncI.second.Ranges) { 658 Ranges[{R.first, R.second - 1}] += 0; 659 } 660 } 661 } else { 662 // For each range, we search for all ranges of the function it belongs to 663 // and initialize it with zero count, so it remains zero if doesn't hit any 664 // samples. This is to be consistent with compiler that interpret zero count 665 // as unexecuted(cold). 666 for (const auto &I : RangeCounter) { 667 uint64_t StartAddress = I.first.first; 668 for (const auto &Range : Binary->getRanges(StartAddress)) 669 Ranges[{Range.first, Range.second - 1}] += 0; 670 } 671 } 672 RangeSample DisjointRanges; 673 findDisjointRanges(DisjointRanges, Ranges); 674 return DisjointRanges; 675 } 676 677 void ProfileGenerator::populateBodySamplesForAllFunctions( 678 const RangeSample &RangeCounter) { 679 for (const auto &Range : preprocessRangeCounter(RangeCounter)) { 680 uint64_t RangeBegin = Range.first.first; 681 uint64_t RangeEnd = Range.first.second; 682 uint64_t Count = Range.second; 683 684 InstructionPointer IP(Binary, RangeBegin, true); 685 // Disjoint ranges may have range in the middle of two instr, 686 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 687 // can be Addr1+1 to Addr2-1. We should ignore such range. 688 if (IP.Address > RangeEnd) 689 continue; 690 691 do { 692 const SampleContextFrameVector FrameVec = 693 Binary->getFrameLocationStack(IP.Address); 694 if (!FrameVec.empty()) { 695 // FIXME: As accumulating total count per instruction caused some 696 // regression, we changed to accumulate total count per byte as a 697 // workaround. Tuning hotness threshold on the compiler side might be 698 // necessary in the future. 699 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( 700 FrameVec, Count * Binary->getInstSize(IP.Address)); 701 updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(), 702 Count); 703 } 704 } while (IP.advance() && IP.Address <= RangeEnd); 705 } 706 } 707 708 StringRef 709 ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) { 710 // Get the function range by branch target if it's a call branch. 711 auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress); 712 713 // We won't accumulate sample count for a range whose start is not the real 714 // function entry such as outlined function or inner labels. 715 if (!FRange || !FRange->IsFuncEntry) 716 return StringRef(); 717 718 return FunctionSamples::getCanonicalFnName(FRange->getFuncName()); 719 } 720 721 void ProfileGenerator::populateBoundarySamplesForAllFunctions( 722 const BranchSample &BranchCounters) { 723 for (const auto &Entry : BranchCounters) { 724 uint64_t SourceAddress = Entry.first.first; 725 uint64_t TargetAddress = Entry.first.second; 726 uint64_t Count = Entry.second; 727 assert(Count != 0 && "Unexpected zero weight branch"); 728 729 StringRef CalleeName = getCalleeNameForAddress(TargetAddress); 730 if (CalleeName.size() == 0) 731 continue; 732 // Record called target sample and its count. 733 const SampleContextFrameVector &FrameVec = 734 Binary->getCachedFrameLocationStack(SourceAddress); 735 if (!FrameVec.empty()) { 736 FunctionSamples &FunctionProfile = 737 getLeafProfileAndAddTotalSamples(FrameVec, 0); 738 FunctionProfile.addCalledTargetSamples( 739 FrameVec.back().Location.LineOffset, 740 getBaseDiscriminator(FrameVec.back().Location.Discriminator), 741 FunctionId(CalleeName), Count); 742 } 743 // Add head samples for callee. 744 FunctionSamples &CalleeProfile = 745 getTopLevelFunctionProfile(FunctionId(CalleeName)); 746 CalleeProfile.addHeadSamples(Count); 747 } 748 } 749 750 void ProfileGeneratorBase::calculateBodySamplesAndSize( 751 const FunctionSamples &FSamples, uint64_t &TotalBodySamples, 752 uint64_t &FuncBodySize) { 753 // Note that ideally the size should be the number of function instruction. 754 // However, for probe-based profile, we don't have the accurate instruction 755 // count for each probe, instead, the probe sample is the samples count for 756 // the block, which is equivelant to 757 // total_instruction_samples/num_of_instruction in one block. Hence, we use 758 // the number of probe as a proxy for the function's size. 759 FuncBodySize += FSamples.getBodySamples().size(); 760 761 // The accumulated body samples re-calculated here could be different from the 762 // TotalSamples(getTotalSamples) field of FunctionSamples for line-number 763 // based profile. The reason is that TotalSamples is the sum of all the 764 // samples of the machine instruction in one source-code line, however, the 765 // entry of Bodysamples is the only max number of them, so the TotalSamples is 766 // usually much bigger than the accumulated body samples as one souce-code 767 // line can emit many machine instructions. We observed a regression when we 768 // switched to use the accumulated body samples(by using 769 // -update-total-samples). Hence, it's safer to re-calculate here to avoid 770 // such discrepancy. There is no problem for probe-based profile, as the 771 // TotalSamples is exactly the same as the accumulated body samples. 772 for (const auto &I : FSamples.getBodySamples()) 773 TotalBodySamples += I.second.getSamples(); 774 775 for (const auto &CallsiteSamples : FSamples.getCallsiteSamples()) 776 for (const auto &Callee : CallsiteSamples.second) { 777 // For binary-level density, the inlinees' samples and size should be 778 // included in the calculation. 779 calculateBodySamplesAndSize(Callee.second, TotalBodySamples, 780 FuncBodySize); 781 } 782 } 783 784 // Calculate Profile-density: 785 // Calculate the density for each function and sort them in descending order, 786 // keep accumulating their total samples unitl it exceeds the 787 // percentage_threshold(cut-off) of total profile samples, the profile-density 788 // is the last(minimum) function-density of the processed functions, which means 789 // all the functions hot to perf are on good density if the profile-density is 790 // good. The percentage_threshold(--profile-density-cutoff-hot) is configurable 791 // depending on how much regression the system want to tolerate. 792 double 793 ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) { 794 double ProfileDensity = 0.0; 795 796 uint64_t TotalProfileSamples = 0; 797 // A list of the function profile density and its total samples. 798 std::vector<std::pair<double, uint64_t>> FuncDensityList; 799 for (const auto &I : Profiles) { 800 uint64_t TotalBodySamples = 0; 801 uint64_t FuncBodySize = 0; 802 calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize); 803 804 if (FuncBodySize == 0) 805 continue; 806 807 double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize; 808 TotalProfileSamples += TotalBodySamples; 809 FuncDensityList.emplace_back(FuncDensity, TotalBodySamples); 810 } 811 812 // Sorted by the density in descending order. 813 llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A, 814 const std::pair<double, uint64_t> &B) { 815 if (A.first != B.first) 816 return A.first > B.first; 817 return A.second < B.second; 818 }); 819 820 uint64_t AccumulatedSamples = 0; 821 uint32_t I = 0; 822 assert(ProfileDensityCutOffHot <= 1000000 && 823 "The cutoff value is greater than 1000000(100%)"); 824 while (AccumulatedSamples < TotalProfileSamples * 825 static_cast<float>(ProfileDensityCutOffHot) / 826 1000000 && 827 I < FuncDensityList.size()) { 828 AccumulatedSamples += FuncDensityList[I].second; 829 ProfileDensity = FuncDensityList[I].first; 830 I++; 831 } 832 833 return ProfileDensity; 834 } 835 836 void ProfileGeneratorBase::calculateAndShowDensity( 837 const SampleProfileMap &Profiles) { 838 double Density = calculateDensity(Profiles); 839 showDensitySuggestion(Density); 840 } 841 842 FunctionSamples * 843 CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode, 844 bool WasLeafInlined) { 845 FunctionSamples *FProfile = ContextNode->getFunctionSamples(); 846 if (!FProfile) { 847 FSamplesList.emplace_back(); 848 FProfile = &FSamplesList.back(); 849 FProfile->setFunction(ContextNode->getFuncName()); 850 ContextNode->setFunctionSamples(FProfile); 851 } 852 // Update ContextWasInlined attribute for existing contexts. 853 // The current function can be called in two ways: 854 // - when processing a probe of the current frame 855 // - when processing the entry probe of an inlinee's frame, which 856 // is then used to update the callsite count of the current frame. 857 // The two can happen in any order, hence here we are making sure 858 // `ContextWasInlined` is always set as expected. 859 // TODO: Note that the former does not always happen if no probes of the 860 // current frame has samples, and if the latter happens, we could lose the 861 // attribute. This should be fixed. 862 if (WasLeafInlined) 863 FProfile->getContext().setAttribute(ContextWasInlined); 864 return FProfile; 865 } 866 867 ContextTrieNode * 868 CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context, 869 bool WasLeafInlined) { 870 ContextTrieNode *ContextNode = 871 ContextTracker.getOrCreateContextPath(Context, true); 872 getOrCreateFunctionSamples(ContextNode, WasLeafInlined); 873 return ContextNode; 874 } 875 876 void CSProfileGenerator::generateProfile() { 877 FunctionSamples::ProfileIsCS = true; 878 879 collectProfiledFunctions(); 880 881 if (Binary->usePseudoProbes()) { 882 Binary->decodePseudoProbe(); 883 if (InferMissingFrames) 884 initializeMissingFrameInferrer(); 885 } 886 887 if (SampleCounters) { 888 if (Binary->usePseudoProbes()) { 889 generateProbeBasedProfile(); 890 } else { 891 generateLineNumBasedProfile(); 892 } 893 } 894 895 if (Binary->getTrackFuncContextSize()) 896 computeSizeForProfiledFunctions(); 897 898 postProcessProfiles(); 899 } 900 901 void CSProfileGenerator::initializeMissingFrameInferrer() { 902 Binary->getMissingContextInferrer()->initialize(SampleCounters); 903 } 904 905 void CSProfileGenerator::inferMissingFrames( 906 const SmallVectorImpl<uint64_t> &Context, 907 SmallVectorImpl<uint64_t> &NewContext) { 908 Binary->inferMissingFrames(Context, NewContext); 909 } 910 911 void CSProfileGenerator::computeSizeForProfiledFunctions() { 912 for (auto *Func : Binary->getProfiledFunctions()) 913 Binary->computeInlinedContextSizeForFunc(Func); 914 915 // Flush the symbolizer to save memory. 916 Binary->flushSymbolizer(); 917 } 918 919 void CSProfileGenerator::updateFunctionSamples() { 920 for (auto *Node : ContextTracker) { 921 FunctionSamples *FSamples = Node->getFunctionSamples(); 922 if (FSamples) { 923 if (UpdateTotalSamples) 924 FSamples->updateTotalSamples(); 925 FSamples->updateCallsiteSamples(); 926 } 927 } 928 } 929 930 void CSProfileGenerator::generateLineNumBasedProfile() { 931 for (const auto &CI : *SampleCounters) { 932 const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr()); 933 934 ContextTrieNode *ContextNode = &getRootContext(); 935 // Sample context will be empty if the jump is an external-to-internal call 936 // pattern, the head samples should be added for the internal function. 937 if (!CtxKey->Context.empty()) { 938 // Get or create function profile for the range 939 ContextNode = 940 getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined); 941 // Fill in function body samples 942 populateBodySamplesForFunction(*ContextNode->getFunctionSamples(), 943 CI.second.RangeCounter); 944 } 945 // Fill in boundary sample counts as well as call site samples for calls 946 populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter); 947 } 948 // Fill in call site value sample for inlined calls and also use context to 949 // infer missing samples. Since we don't have call count for inlined 950 // functions, we estimate it from inlinee's profile using the entry of the 951 // body sample. 952 populateInferredFunctionSamples(getRootContext()); 953 954 updateFunctionSamples(); 955 } 956 957 void CSProfileGenerator::populateBodySamplesForFunction( 958 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { 959 // Compute disjoint ranges first, so we can use MAX 960 // for calculating count for each location. 961 RangeSample Ranges; 962 findDisjointRanges(Ranges, RangeCounter); 963 for (const auto &Range : Ranges) { 964 uint64_t RangeBegin = Range.first.first; 965 uint64_t RangeEnd = Range.first.second; 966 uint64_t Count = Range.second; 967 // Disjoint ranges have introduce zero-filled gap that 968 // doesn't belong to current context, filter them out. 969 if (Count == 0) 970 continue; 971 972 InstructionPointer IP(Binary, RangeBegin, true); 973 // Disjoint ranges may have range in the middle of two instr, 974 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 975 // can be Addr1+1 to Addr2-1. We should ignore such range. 976 if (IP.Address > RangeEnd) 977 continue; 978 979 do { 980 auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address); 981 if (LeafLoc) { 982 // Recording body sample for this specific context 983 updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); 984 FunctionProfile.addTotalSamples(Count); 985 } 986 } while (IP.advance() && IP.Address <= RangeEnd); 987 } 988 } 989 990 void CSProfileGenerator::populateBoundarySamplesForFunction( 991 ContextTrieNode *Node, const BranchSample &BranchCounters) { 992 993 for (const auto &Entry : BranchCounters) { 994 uint64_t SourceAddress = Entry.first.first; 995 uint64_t TargetAddress = Entry.first.second; 996 uint64_t Count = Entry.second; 997 assert(Count != 0 && "Unexpected zero weight branch"); 998 999 StringRef CalleeName = getCalleeNameForAddress(TargetAddress); 1000 if (CalleeName.size() == 0) 1001 continue; 1002 1003 ContextTrieNode *CallerNode = Node; 1004 LineLocation CalleeCallSite(0, 0); 1005 if (CallerNode != &getRootContext()) { 1006 // Record called target sample and its count 1007 auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress); 1008 if (LeafLoc) { 1009 CallerNode->getFunctionSamples()->addCalledTargetSamples( 1010 LeafLoc->Location.LineOffset, 1011 getBaseDiscriminator(LeafLoc->Location.Discriminator), 1012 FunctionId(CalleeName), 1013 Count); 1014 // Record head sample for called target(callee) 1015 CalleeCallSite = LeafLoc->Location; 1016 } 1017 } 1018 1019 ContextTrieNode *CalleeNode = 1020 CallerNode->getOrCreateChildContext(CalleeCallSite, 1021 FunctionId(CalleeName)); 1022 FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode); 1023 CalleeProfile->addHeadSamples(Count); 1024 } 1025 } 1026 1027 void CSProfileGenerator::populateInferredFunctionSamples( 1028 ContextTrieNode &Node) { 1029 // There is no call jmp sample between the inliner and inlinee, we need to use 1030 // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s 1031 // sample depends on child(inlinee)'s sample, so traverse the tree in 1032 // post-order. 1033 for (auto &It : Node.getAllChildContext()) 1034 populateInferredFunctionSamples(It.second); 1035 1036 FunctionSamples *CalleeProfile = Node.getFunctionSamples(); 1037 if (!CalleeProfile) 1038 return; 1039 // If we already have head sample counts, we must have value profile 1040 // for call sites added already. Skip to avoid double counting. 1041 if (CalleeProfile->getHeadSamples()) 1042 return; 1043 ContextTrieNode *CallerNode = Node.getParentContext(); 1044 // If we don't have context, nothing to do for caller's call site. 1045 // This could happen for entry point function. 1046 if (CallerNode == &getRootContext()) 1047 return; 1048 1049 LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc(); 1050 FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode); 1051 // Since we don't have call count for inlined functions, we 1052 // estimate it from inlinee's profile using entry body sample. 1053 uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate(); 1054 // If we don't have samples with location, use 1 to indicate live. 1055 if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size()) 1056 EstimatedCallCount = 1; 1057 CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset, 1058 CallerLeafFrameLoc.Discriminator, 1059 Node.getFuncName(), EstimatedCallCount); 1060 CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset, 1061 CallerLeafFrameLoc.Discriminator, 1062 EstimatedCallCount); 1063 CallerProfile.addTotalSamples(EstimatedCallCount); 1064 } 1065 1066 void CSProfileGenerator::convertToProfileMap( 1067 ContextTrieNode &Node, SampleContextFrameVector &Context) { 1068 FunctionSamples *FProfile = Node.getFunctionSamples(); 1069 if (FProfile) { 1070 Context.emplace_back(Node.getFuncName(), LineLocation(0, 0)); 1071 // Save the new context for future references. 1072 SampleContextFrames NewContext = *Contexts.insert(Context).first; 1073 auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile)); 1074 FunctionSamples &NewProfile = Ret.first->second; 1075 NewProfile.getContext().setContext(NewContext); 1076 Context.pop_back(); 1077 } 1078 1079 for (auto &It : Node.getAllChildContext()) { 1080 ContextTrieNode &ChildNode = It.second; 1081 Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc()); 1082 convertToProfileMap(ChildNode, Context); 1083 Context.pop_back(); 1084 } 1085 } 1086 1087 void CSProfileGenerator::convertToProfileMap() { 1088 assert(ProfileMap.empty() && 1089 "ProfileMap should be empty before converting from the trie"); 1090 assert(IsProfileValidOnTrie && 1091 "Do not convert the trie twice, it's already destroyed"); 1092 1093 SampleContextFrameVector Context; 1094 for (auto &It : getRootContext().getAllChildContext()) 1095 convertToProfileMap(It.second, Context); 1096 1097 IsProfileValidOnTrie = false; 1098 } 1099 1100 void CSProfileGenerator::postProcessProfiles() { 1101 // Compute hot/cold threshold based on profile. This will be used for cold 1102 // context profile merging/trimming. 1103 computeSummaryAndThreshold(); 1104 1105 // Run global pre-inliner to adjust/merge context profile based on estimated 1106 // inline decisions. 1107 if (EnableCSPreInliner) { 1108 ContextTracker.populateFuncToCtxtMap(); 1109 CSPreInliner(ContextTracker, *Binary, Summary.get()).run(); 1110 // Turn off the profile merger by default unless it is explicitly enabled. 1111 if (!CSProfMergeColdContext.getNumOccurrences()) 1112 CSProfMergeColdContext = false; 1113 } 1114 1115 convertToProfileMap(); 1116 1117 // Trim and merge cold context profile using cold threshold above. 1118 if (TrimColdProfile || CSProfMergeColdContext) { 1119 SampleContextTrimmer(ProfileMap) 1120 .trimAndMergeColdContextProfiles( 1121 HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, 1122 CSProfMaxColdContextDepth, EnableCSPreInliner); 1123 } 1124 1125 if (GenCSNestedProfile) { 1126 ProfileConverter CSConverter(ProfileMap); 1127 CSConverter.convertCSProfiles(); 1128 FunctionSamples::ProfileIsCS = false; 1129 } 1130 filterAmbiguousProfile(ProfileMap); 1131 ProfileGeneratorBase::calculateAndShowDensity(ProfileMap); 1132 } 1133 1134 void ProfileGeneratorBase::computeSummaryAndThreshold( 1135 SampleProfileMap &Profiles) { 1136 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); 1137 Summary = Builder.computeSummaryForProfiles(Profiles); 1138 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( 1139 (Summary->getDetailedSummary())); 1140 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( 1141 (Summary->getDetailedSummary())); 1142 } 1143 1144 void CSProfileGenerator::computeSummaryAndThreshold() { 1145 // Always merge and use context-less profile map to compute summary. 1146 SampleProfileMap ContextLessProfiles; 1147 ContextTracker.createContextLessProfileMap(ContextLessProfiles); 1148 1149 // Set the flag below to avoid merging the profile again in 1150 // computeSummaryAndThreshold 1151 FunctionSamples::ProfileIsCS = false; 1152 assert( 1153 (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) && 1154 "Don't set --profile-summary-contextless to false for profile " 1155 "generation"); 1156 ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles); 1157 // Recover the old value. 1158 FunctionSamples::ProfileIsCS = true; 1159 } 1160 1161 void ProfileGeneratorBase::extractProbesFromRange( 1162 const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter, 1163 bool FindDisjointRanges) { 1164 const RangeSample *PRanges = &RangeCounter; 1165 RangeSample Ranges; 1166 if (FindDisjointRanges) { 1167 findDisjointRanges(Ranges, RangeCounter); 1168 PRanges = &Ranges; 1169 } 1170 1171 for (const auto &Range : *PRanges) { 1172 uint64_t RangeBegin = Range.first.first; 1173 uint64_t RangeEnd = Range.first.second; 1174 uint64_t Count = Range.second; 1175 1176 InstructionPointer IP(Binary, RangeBegin, true); 1177 // Disjoint ranges may have range in the middle of two instr, 1178 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 1179 // can be Addr1+1 to Addr2-1. We should ignore such range. 1180 if (IP.Address > RangeEnd) 1181 continue; 1182 1183 do { 1184 const AddressProbesMap &Address2ProbesMap = 1185 Binary->getAddress2ProbesMap(); 1186 for (const MCDecodedPseudoProbe &Probe : 1187 Address2ProbesMap.find(IP.Address)) { 1188 ProbeCounter[&Probe] += Count; 1189 } 1190 } while (IP.advance() && IP.Address <= RangeEnd); 1191 } 1192 } 1193 1194 static void extractPrefixContextStack(SampleContextFrameVector &ContextStack, 1195 const SmallVectorImpl<uint64_t> &AddrVec, 1196 ProfiledBinary *Binary) { 1197 SmallVector<const MCDecodedPseudoProbe *, 16> Probes; 1198 for (auto Address : reverse(AddrVec)) { 1199 const MCDecodedPseudoProbe *CallProbe = 1200 Binary->getCallProbeForAddr(Address); 1201 // These could be the cases when a probe is not found at a calliste. Cutting 1202 // off the context from here since the inliner will not know how to consume 1203 // a context with unknown callsites. 1204 // 1. for functions that are not sampled when 1205 // --decode-probe-for-profiled-functions-only is on. 1206 // 2. for a merged callsite. Callsite merging may cause the loss of original 1207 // probe IDs. 1208 // 3. for an external callsite. 1209 if (!CallProbe) 1210 break; 1211 Probes.push_back(CallProbe); 1212 } 1213 1214 std::reverse(Probes.begin(), Probes.end()); 1215 1216 // Extract context stack for reusing, leaf context stack will be added 1217 // compressed while looking up function profile. 1218 for (const auto *P : Probes) { 1219 Binary->getInlineContextForProbe(P, ContextStack, true); 1220 } 1221 } 1222 1223 void CSProfileGenerator::generateProbeBasedProfile() { 1224 // Enable pseudo probe functionalities in SampleProf 1225 FunctionSamples::ProfileIsProbeBased = true; 1226 for (const auto &CI : *SampleCounters) { 1227 const AddrBasedCtxKey *CtxKey = 1228 dyn_cast<AddrBasedCtxKey>(CI.first.getPtr()); 1229 // Fill in function body samples from probes, also infer caller's samples 1230 // from callee's probe 1231 populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey); 1232 // Fill in boundary samples for a call probe 1233 populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey); 1234 } 1235 } 1236 1237 void CSProfileGenerator::populateBodySamplesWithProbes( 1238 const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) { 1239 ProbeCounterMap ProbeCounter; 1240 // Extract the top frame probes by looking up each address among the range in 1241 // the Address2ProbeMap 1242 extractProbesFromRange(RangeCounter, ProbeCounter); 1243 std::unordered_map<MCDecodedPseudoProbeInlineTree *, 1244 std::unordered_set<FunctionSamples *>> 1245 FrameSamples; 1246 for (const auto &PI : ProbeCounter) { 1247 const MCDecodedPseudoProbe *Probe = PI.first; 1248 uint64_t Count = PI.second; 1249 // Disjoint ranges have introduce zero-filled gap that 1250 // doesn't belong to current context, filter them out. 1251 if (!Probe->isBlock() || Count == 0) 1252 continue; 1253 1254 ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe); 1255 FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples(); 1256 // Record the current frame and FunctionProfile whenever samples are 1257 // collected for non-danglie probes. This is for reporting all of the 1258 // zero count probes of the frame later. 1259 FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile); 1260 FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(), 1261 Count); 1262 FunctionProfile.addTotalSamples(Count); 1263 if (Probe->isEntry()) { 1264 FunctionProfile.addHeadSamples(Count); 1265 // Look up for the caller's function profile 1266 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe); 1267 ContextTrieNode *CallerNode = ContextNode->getParentContext(); 1268 if (InlinerDesc != nullptr && CallerNode != &getRootContext()) { 1269 // Since the context id will be compressed, we have to use callee's 1270 // context id to infer caller's context id to ensure they share the 1271 // same context prefix. 1272 uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset; 1273 uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator; 1274 assert(CallerIndex && 1275 "Inferred caller's location index shouldn't be zero!"); 1276 assert(!CallerDiscriminator && 1277 "Callsite probe should not have a discriminator!"); 1278 FunctionSamples &CallerProfile = 1279 *getOrCreateFunctionSamples(CallerNode); 1280 CallerProfile.setFunctionHash(InlinerDesc->FuncHash); 1281 CallerProfile.addBodySamples(CallerIndex, CallerDiscriminator, Count); 1282 CallerProfile.addTotalSamples(Count); 1283 CallerProfile.addCalledTargetSamples(CallerIndex, CallerDiscriminator, 1284 ContextNode->getFuncName(), Count); 1285 } 1286 } 1287 } 1288 1289 // Assign zero count for remaining probes without sample hits to 1290 // differentiate from probes optimized away, of which the counts are unknown 1291 // and will be inferred by the compiler. 1292 for (auto &I : FrameSamples) { 1293 for (auto *FunctionProfile : I.second) { 1294 for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) { 1295 FunctionProfile->addBodySamples(Probe.getIndex(), 1296 Probe.getDiscriminator(), 0); 1297 } 1298 } 1299 } 1300 } 1301 1302 void CSProfileGenerator::populateBoundarySamplesWithProbes( 1303 const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) { 1304 for (const auto &BI : BranchCounter) { 1305 uint64_t SourceAddress = BI.first.first; 1306 uint64_t TargetAddress = BI.first.second; 1307 uint64_t Count = BI.second; 1308 const MCDecodedPseudoProbe *CallProbe = 1309 Binary->getCallProbeForAddr(SourceAddress); 1310 if (CallProbe == nullptr) 1311 continue; 1312 FunctionSamples &FunctionProfile = 1313 getFunctionProfileForLeafProbe(CtxKey, CallProbe); 1314 FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); 1315 FunctionProfile.addTotalSamples(Count); 1316 StringRef CalleeName = getCalleeNameForAddress(TargetAddress); 1317 if (CalleeName.size() == 0) 1318 continue; 1319 FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 1320 CallProbe->getDiscriminator(), 1321 FunctionId(CalleeName), Count); 1322 } 1323 } 1324 1325 ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe( 1326 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { 1327 1328 const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context; 1329 SmallVector<uint64_t, 16> NewContext; 1330 1331 if (InferMissingFrames) { 1332 SmallVector<uint64_t, 16> Context = CtxKey->Context; 1333 // Append leaf frame for a complete inference. 1334 Context.push_back(LeafProbe->getAddress()); 1335 inferMissingFrames(Context, NewContext); 1336 // Pop out the leaf probe that was pushed in above. 1337 NewContext.pop_back(); 1338 PContext = &NewContext; 1339 } 1340 1341 SampleContextFrameVector ContextStack; 1342 extractPrefixContextStack(ContextStack, *PContext, Binary); 1343 1344 // Explicitly copy the context for appending the leaf context 1345 SampleContextFrameVector NewContextStack(ContextStack.begin(), 1346 ContextStack.end()); 1347 Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); 1348 // For leaf inlined context with the top frame, we should strip off the top 1349 // frame's probe id, like: 1350 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" 1351 auto LeafFrame = NewContextStack.back(); 1352 LeafFrame.Location = LineLocation(0, 0); 1353 NewContextStack.pop_back(); 1354 // Compress the context string except for the leaf frame 1355 CSProfileGenerator::compressRecursionContext(NewContextStack); 1356 CSProfileGenerator::trimContext(NewContextStack); 1357 NewContextStack.push_back(LeafFrame); 1358 1359 const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); 1360 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); 1361 ContextTrieNode *ContextNode = 1362 getOrCreateContextNode(NewContextStack, WasLeafInlined); 1363 ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash); 1364 return ContextNode; 1365 } 1366 1367 FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( 1368 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { 1369 return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples(); 1370 } 1371 1372 } // end namespace sampleprof 1373 } // end namespace llvm 1374