1 //===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "ProfileGenerator.h" 9 #include "ErrorHandling.h" 10 #include "MissingFrameInferrer.h" 11 #include "PerfReader.h" 12 #include "ProfiledBinary.h" 13 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" 14 #include "llvm/ProfileData/ProfileCommon.h" 15 #include <algorithm> 16 #include <float.h> 17 #include <unordered_set> 18 #include <utility> 19 20 cl::opt<std::string> OutputFilename("output", cl::value_desc("output"), 21 cl::Required, 22 cl::desc("Output profile file")); 23 static cl::alias OutputA("o", cl::desc("Alias for --output"), 24 cl::aliasopt(OutputFilename)); 25 26 static cl::opt<SampleProfileFormat> OutputFormat( 27 "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), 28 cl::values( 29 clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"), 30 clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), 31 clEnumValN(SPF_Text, "text", "Text encoding"), 32 clEnumValN(SPF_GCC, "gcc", 33 "GCC encoding (only meaningful for -sample)"))); 34 35 static cl::opt<bool> UseMD5( 36 "use-md5", cl::Hidden, 37 cl::desc("Use md5 to represent function names in the output profile (only " 38 "meaningful for -extbinary)")); 39 40 static cl::opt<bool> PopulateProfileSymbolList( 41 "populate-profile-symbol-list", cl::init(false), cl::Hidden, 42 cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); 43 44 static cl::opt<bool> FillZeroForAllFuncs( 45 "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, 46 cl::desc("Attribute all functions' range with zero count " 47 "even it's not hit by any samples.")); 48 49 static cl::opt<int32_t, true> RecursionCompression( 50 "compress-recursion", 51 cl::desc("Compressing recursion by deduplicating adjacent frame " 52 "sequences up to the specified size. -1 means no size limit."), 53 cl::Hidden, 54 cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); 55 56 static cl::opt<bool> 57 TrimColdProfile("trim-cold-profile", 58 cl::desc("If the total count of the profile is smaller " 59 "than threshold, it will be trimmed.")); 60 61 static cl::opt<bool> CSProfMergeColdContext( 62 "csprof-merge-cold-context", cl::init(true), 63 cl::desc("If the total count of context profile is smaller than " 64 "the threshold, it will be merged into context-less base " 65 "profile.")); 66 67 static cl::opt<uint32_t> CSProfMaxColdContextDepth( 68 "csprof-max-cold-context-depth", cl::init(1), 69 cl::desc("Keep the last K contexts while merging cold profile. 1 means the " 70 "context-less base profile")); 71 72 static cl::opt<int, true> CSProfMaxContextDepth( 73 "csprof-max-context-depth", 74 cl::desc("Keep the last K contexts while merging profile. -1 means no " 75 "depth limit."), 76 cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); 77 78 static cl::opt<double> ProfileDensityThreshold( 79 "profile-density-threshold", llvm::cl::init(50), 80 llvm::cl::desc("If the profile density is below the given threshold, it " 81 "will be suggested to increase the sampling rate."), 82 llvm::cl::Optional); 83 static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false), 84 llvm::cl::desc("show profile density details"), 85 llvm::cl::Optional); 86 static cl::opt<int> ProfileDensityCutOffHot( 87 "profile-density-cutoff-hot", llvm::cl::init(990000), 88 llvm::cl::desc("Total samples cutoff for functions used to calculate " 89 "profile density.")); 90 91 static cl::opt<bool> UpdateTotalSamples( 92 "update-total-samples", llvm::cl::init(false), 93 llvm::cl::desc( 94 "Update total samples by accumulating all its body samples."), 95 llvm::cl::Optional); 96 97 static cl::opt<bool> GenCSNestedProfile( 98 "gen-cs-nested-profile", cl::Hidden, cl::init(true), 99 cl::desc("Generate nested function profiles for CSSPGO")); 100 101 cl::opt<bool> InferMissingFrames( 102 "infer-missing-frames", llvm::cl::init(true), 103 llvm::cl::desc( 104 "Infer missing call frames due to compiler tail call elimination."), 105 llvm::cl::Optional); 106 107 using namespace llvm; 108 using namespace sampleprof; 109 110 namespace llvm { 111 extern cl::opt<int> ProfileSummaryCutoffHot; 112 extern cl::opt<bool> UseContextLessSummary; 113 114 namespace sampleprof { 115 116 // Initialize the MaxCompressionSize to -1 which means no size limit 117 int32_t CSProfileGenerator::MaxCompressionSize = -1; 118 119 int CSProfileGenerator::MaxContextDepth = -1; 120 121 bool ProfileGeneratorBase::UseFSDiscriminator = false; 122 123 std::unique_ptr<ProfileGeneratorBase> 124 ProfileGeneratorBase::create(ProfiledBinary *Binary, 125 const ContextSampleCounterMap *SampleCounters, 126 bool ProfileIsCS) { 127 std::unique_ptr<ProfileGeneratorBase> Generator; 128 if (ProfileIsCS) { 129 Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); 130 } else { 131 Generator.reset(new ProfileGenerator(Binary, SampleCounters)); 132 } 133 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); 134 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); 135 136 return Generator; 137 } 138 139 std::unique_ptr<ProfileGeneratorBase> 140 ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles, 141 bool ProfileIsCS) { 142 std::unique_ptr<ProfileGeneratorBase> Generator; 143 if (ProfileIsCS) { 144 Generator.reset(new CSProfileGenerator(Binary, Profiles)); 145 } else { 146 Generator.reset(new ProfileGenerator(Binary, std::move(Profiles))); 147 } 148 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); 149 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); 150 151 return Generator; 152 } 153 154 void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer, 155 SampleProfileMap &ProfileMap) { 156 // Populate profile symbol list if extended binary format is used. 157 ProfileSymbolList SymbolList; 158 159 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { 160 Binary->populateSymbolListFromDWARF(SymbolList); 161 Writer->setProfileSymbolList(&SymbolList); 162 } 163 164 if (std::error_code EC = Writer->write(ProfileMap)) 165 exitWithError(std::move(EC)); 166 } 167 168 void ProfileGeneratorBase::write() { 169 auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); 170 if (std::error_code EC = WriterOrErr.getError()) 171 exitWithError(EC, OutputFilename); 172 173 if (UseMD5) { 174 if (OutputFormat != SPF_Ext_Binary) 175 WithColor::warning() << "-use-md5 is ignored. Specify " 176 "--format=extbinary to enable it\n"; 177 else 178 WriterOrErr.get()->setUseMD5(); 179 } 180 181 write(std::move(WriterOrErr.get()), ProfileMap); 182 } 183 184 void ProfileGeneratorBase::showDensitySuggestion(double Density) { 185 if (Density == 0.0) 186 WithColor::warning() << "The output profile is empty or the " 187 "--profile-density-cutoff-hot option is " 188 "set too low. Please check your command.\n"; 189 else if (Density < ProfileDensityThreshold) 190 WithColor::warning() 191 << "Sample PGO is estimated to optimize better with " 192 << format("%.1f", ProfileDensityThreshold / Density) 193 << "x more samples. Please consider increasing sampling rate or " 194 "profiling for longer duration to get more samples.\n"; 195 196 if (ShowDensity) 197 outs() << "Functions with density >= " << format("%.1f", Density) 198 << " account for " 199 << format("%.2f", 200 static_cast<double>(ProfileDensityCutOffHot) / 10000) 201 << "% total sample counts.\n"; 202 } 203 204 bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) { 205 for (const auto &Prefix : FuncPrefixsToFilter) { 206 if (FS.getFuncName().starts_with(Prefix)) 207 return true; 208 } 209 210 // Filter the function profiles for the inlinees. It's useful for fuzzy 211 // profile matching which flattens the profile and inlinees' samples are 212 // merged into top-level function. 213 for (auto &Callees : 214 const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) { 215 auto &CalleesMap = Callees.second; 216 for (auto I = CalleesMap.begin(); I != CalleesMap.end();) { 217 auto FS = I++; 218 if (filterAmbiguousProfile(FS->second)) 219 CalleesMap.erase(FS); 220 } 221 } 222 return false; 223 } 224 225 // For built-in local initialization function such as __cxx_global_var_init, 226 // __tls_init prefix function, there could be multiple versions of the functions 227 // in the final binary. However, in the profile generation, we call 228 // getCanonicalFnName to canonicalize the names which strips the suffixes. 229 // Therefore, samples from different functions queries the same profile and the 230 // samples are merged. As the functions are essentially different, entries of 231 // the merged profile are ambiguous. In sample loader, the IR from one version 232 // would be attributed towards a merged entries, which is inaccurate. Especially 233 // for fuzzy profile matching, it gets multiple callsites(from different 234 // function) but used to match one callsite, which misleads the matching and 235 // causes a lot of false positives report. Hence, we want to filter them out 236 // from the profile map during the profile generation time. The profiles are all 237 // cold functions, it won't have perf impact. 238 void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) { 239 for (auto I = ProfileMap.begin(); I != ProfileMap.end();) { 240 auto FS = I++; 241 if (filterAmbiguousProfile(FS->second)) 242 ProfileMap.erase(FS); 243 } 244 } 245 246 void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, 247 const RangeSample &Ranges) { 248 249 /* 250 Regions may overlap with each other. Using the boundary info, find all 251 disjoint ranges and their sample count. BoundaryPoint contains the count 252 multiple samples begin/end at this points. 253 254 |<--100-->| Sample1 255 |<------200------>| Sample2 256 A B C 257 258 In the example above, 259 Sample1 begins at A, ends at B, its value is 100. 260 Sample2 beings at A, ends at C, its value is 200. 261 For A, BeginCount is the sum of sample begins at A, which is 300 and no 262 samples ends at A, so EndCount is 0. 263 Then boundary points A, B, and C with begin/end counts are: 264 A: (300, 0) 265 B: (0, 100) 266 C: (0, 200) 267 */ 268 struct BoundaryPoint { 269 // Sum of sample counts beginning at this point 270 uint64_t BeginCount = UINT64_MAX; 271 // Sum of sample counts ending at this point 272 uint64_t EndCount = UINT64_MAX; 273 // Is the begin point of a zero range. 274 bool IsZeroRangeBegin = false; 275 // Is the end point of a zero range. 276 bool IsZeroRangeEnd = false; 277 278 void addBeginCount(uint64_t Count) { 279 if (BeginCount == UINT64_MAX) 280 BeginCount = 0; 281 BeginCount += Count; 282 } 283 284 void addEndCount(uint64_t Count) { 285 if (EndCount == UINT64_MAX) 286 EndCount = 0; 287 EndCount += Count; 288 } 289 }; 290 291 /* 292 For the above example. With boundary points, follwing logic finds two 293 disjoint region of 294 295 [A,B]: 300 296 [B+1,C]: 200 297 298 If there is a boundary point that both begin and end, the point itself 299 becomes a separate disjoint region. For example, if we have original 300 ranges of 301 302 |<--- 100 --->| 303 |<--- 200 --->| 304 A B C 305 306 there are three boundary points with their begin/end counts of 307 308 A: (100, 0) 309 B: (200, 100) 310 C: (0, 200) 311 312 the disjoint ranges would be 313 314 [A, B-1]: 100 315 [B, B]: 300 316 [B+1, C]: 200. 317 318 Example for zero value range: 319 320 |<--- 100 --->| 321 |<--- 200 --->| 322 |<--------------- 0 ----------------->| 323 A B C D E F 324 325 [A, B-1] : 0 326 [B, C] : 100 327 [C+1, D-1]: 0 328 [D, E] : 200 329 [E+1, F] : 0 330 */ 331 std::map<uint64_t, BoundaryPoint> Boundaries; 332 333 for (const auto &Item : Ranges) { 334 assert(Item.first.first <= Item.first.second && 335 "Invalid instruction range"); 336 auto &BeginPoint = Boundaries[Item.first.first]; 337 auto &EndPoint = Boundaries[Item.first.second]; 338 uint64_t Count = Item.second; 339 340 BeginPoint.addBeginCount(Count); 341 EndPoint.addEndCount(Count); 342 if (Count == 0) { 343 BeginPoint.IsZeroRangeBegin = true; 344 EndPoint.IsZeroRangeEnd = true; 345 } 346 } 347 348 // Use UINT64_MAX to indicate there is no existing range between BeginAddress 349 // and the next valid address 350 uint64_t BeginAddress = UINT64_MAX; 351 int ZeroRangeDepth = 0; 352 uint64_t Count = 0; 353 for (const auto &Item : Boundaries) { 354 uint64_t Address = Item.first; 355 const BoundaryPoint &Point = Item.second; 356 if (Point.BeginCount != UINT64_MAX) { 357 if (BeginAddress != UINT64_MAX) 358 DisjointRanges[{BeginAddress, Address - 1}] = Count; 359 Count += Point.BeginCount; 360 BeginAddress = Address; 361 ZeroRangeDepth += Point.IsZeroRangeBegin; 362 } 363 if (Point.EndCount != UINT64_MAX) { 364 assert((BeginAddress != UINT64_MAX) && 365 "First boundary point cannot be 'end' point"); 366 DisjointRanges[{BeginAddress, Address}] = Count; 367 assert(Count >= Point.EndCount && "Mismatched live ranges"); 368 Count -= Point.EndCount; 369 BeginAddress = Address + 1; 370 ZeroRangeDepth -= Point.IsZeroRangeEnd; 371 // If the remaining count is zero and it's no longer in a zero range, this 372 // means we consume all the ranges before, thus mark BeginAddress as 373 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges: 374 // [<---- 10 ---->] 375 // [<---- 20 ---->] 376 // A B C D 377 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't 378 // have the [B+1, C-1] zero range. 379 if (Count == 0 && ZeroRangeDepth == 0) 380 BeginAddress = UINT64_MAX; 381 } 382 } 383 } 384 385 void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( 386 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, 387 uint64_t Count) { 388 // Use the maximum count of samples with same line location 389 uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator); 390 391 // Use duplication factor to compensated for loop unroll/vectorization. 392 // Note that this is only needed when we're taking MAX of the counts at 393 // the location instead of SUM. 394 Count *= getDuplicationFactor(LeafLoc.Location.Discriminator); 395 396 ErrorOr<uint64_t> R = 397 FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator); 398 399 uint64_t PreviousCount = R ? R.get() : 0; 400 if (PreviousCount <= Count) { 401 FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, 402 Count - PreviousCount); 403 } 404 } 405 406 void ProfileGeneratorBase::updateTotalSamples() { 407 for (auto &Item : ProfileMap) { 408 FunctionSamples &FunctionProfile = Item.second; 409 FunctionProfile.updateTotalSamples(); 410 } 411 } 412 413 void ProfileGeneratorBase::updateCallsiteSamples() { 414 for (auto &Item : ProfileMap) { 415 FunctionSamples &FunctionProfile = Item.second; 416 FunctionProfile.updateCallsiteSamples(); 417 } 418 } 419 420 void ProfileGeneratorBase::updateFunctionSamples() { 421 updateCallsiteSamples(); 422 423 if (UpdateTotalSamples) 424 updateTotalSamples(); 425 } 426 427 void ProfileGeneratorBase::collectProfiledFunctions() { 428 std::unordered_set<const BinaryFunction *> ProfiledFunctions; 429 if (collectFunctionsFromRawProfile(ProfiledFunctions)) 430 Binary->setProfiledFunctions(ProfiledFunctions); 431 else if (collectFunctionsFromLLVMProfile(ProfiledFunctions)) 432 Binary->setProfiledFunctions(ProfiledFunctions); 433 else 434 llvm_unreachable("Unsupported input profile"); 435 } 436 437 bool ProfileGeneratorBase::collectFunctionsFromRawProfile( 438 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) { 439 if (!SampleCounters) 440 return false; 441 // Go through all the stacks, ranges and branches in sample counters, use 442 // the start of the range to look up the function it belongs and record the 443 // function. 444 for (const auto &CI : *SampleCounters) { 445 if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(CI.first.getPtr())) { 446 for (auto StackAddr : CtxKey->Context) { 447 if (FuncRange *FRange = Binary->findFuncRange(StackAddr)) 448 ProfiledFunctions.insert(FRange->Func); 449 } 450 } 451 452 for (auto Item : CI.second.RangeCounter) { 453 uint64_t StartAddress = Item.first.first; 454 if (FuncRange *FRange = Binary->findFuncRange(StartAddress)) 455 ProfiledFunctions.insert(FRange->Func); 456 } 457 458 for (auto Item : CI.second.BranchCounter) { 459 uint64_t SourceAddress = Item.first.first; 460 uint64_t TargetAddress = Item.first.second; 461 if (FuncRange *FRange = Binary->findFuncRange(SourceAddress)) 462 ProfiledFunctions.insert(FRange->Func); 463 if (FuncRange *FRange = Binary->findFuncRange(TargetAddress)) 464 ProfiledFunctions.insert(FRange->Func); 465 } 466 } 467 return true; 468 } 469 470 bool ProfileGenerator::collectFunctionsFromLLVMProfile( 471 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) { 472 for (const auto &FS : ProfileMap) { 473 if (auto *Func = Binary->getBinaryFunction(FS.second.getFunction())) 474 ProfiledFunctions.insert(Func); 475 } 476 return true; 477 } 478 479 bool CSProfileGenerator::collectFunctionsFromLLVMProfile( 480 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) { 481 for (auto *Node : ContextTracker) { 482 if (!Node->getFuncName().empty()) 483 if (auto *Func = Binary->getBinaryFunction(Node->getFuncName())) 484 ProfiledFunctions.insert(Func); 485 } 486 return true; 487 } 488 489 FunctionSamples & 490 ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) { 491 SampleContext Context(FuncName); 492 return ProfileMap.create(Context); 493 } 494 495 void ProfileGenerator::generateProfile() { 496 collectProfiledFunctions(); 497 498 if (Binary->usePseudoProbes()) 499 Binary->decodePseudoProbe(); 500 501 if (SampleCounters) { 502 if (Binary->usePseudoProbes()) { 503 generateProbeBasedProfile(); 504 } else { 505 generateLineNumBasedProfile(); 506 } 507 } 508 509 postProcessProfiles(); 510 } 511 512 void ProfileGenerator::postProcessProfiles() { 513 computeSummaryAndThreshold(ProfileMap); 514 trimColdProfiles(ProfileMap, ColdCountThreshold); 515 filterAmbiguousProfile(ProfileMap); 516 calculateAndShowDensity(ProfileMap); 517 } 518 519 void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, 520 uint64_t ColdCntThreshold) { 521 if (!TrimColdProfile) 522 return; 523 524 // Move cold profiles into a tmp container. 525 std::vector<hash_code> ColdProfileHashes; 526 for (const auto &I : ProfileMap) { 527 if (I.second.getTotalSamples() < ColdCntThreshold) 528 ColdProfileHashes.emplace_back(I.first); 529 } 530 531 // Remove the cold profile from ProfileMap. 532 for (const auto &I : ColdProfileHashes) 533 ProfileMap.erase(I); 534 } 535 536 void ProfileGenerator::generateLineNumBasedProfile() { 537 assert(SampleCounters->size() == 1 && 538 "Must have one entry for profile generation."); 539 const SampleCounter &SC = SampleCounters->begin()->second; 540 // Fill in function body samples 541 populateBodySamplesForAllFunctions(SC.RangeCounter); 542 // Fill in boundary sample counts as well as call site samples for calls 543 populateBoundarySamplesForAllFunctions(SC.BranchCounter); 544 545 updateFunctionSamples(); 546 } 547 548 void ProfileGenerator::generateProbeBasedProfile() { 549 assert(SampleCounters->size() == 1 && 550 "Must have one entry for profile generation."); 551 // Enable pseudo probe functionalities in SampleProf 552 FunctionSamples::ProfileIsProbeBased = true; 553 const SampleCounter &SC = SampleCounters->begin()->second; 554 // Fill in function body samples 555 populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter); 556 // Fill in boundary sample counts as well as call site samples for calls 557 populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter); 558 559 updateFunctionSamples(); 560 } 561 562 void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions( 563 const RangeSample &RangeCounter) { 564 ProbeCounterMap ProbeCounter; 565 // preprocessRangeCounter returns disjoint ranges, so no longer to redo it 566 // inside extractProbesFromRange. 567 extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter, 568 false); 569 570 for (const auto &PI : ProbeCounter) { 571 const MCDecodedPseudoProbe *Probe = PI.first; 572 uint64_t Count = PI.second; 573 SampleContextFrameVector FrameVec; 574 Binary->getInlineContextForProbe(Probe, FrameVec, true); 575 FunctionSamples &FunctionProfile = 576 getLeafProfileAndAddTotalSamples(FrameVec, Count); 577 FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(), 578 Count); 579 if (Probe->isEntry()) 580 FunctionProfile.addHeadSamples(Count); 581 } 582 } 583 584 void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( 585 const BranchSample &BranchCounters) { 586 for (const auto &Entry : BranchCounters) { 587 uint64_t SourceAddress = Entry.first.first; 588 uint64_t TargetAddress = Entry.first.second; 589 uint64_t Count = Entry.second; 590 assert(Count != 0 && "Unexpected zero weight branch"); 591 592 StringRef CalleeName = getCalleeNameForAddress(TargetAddress); 593 if (CalleeName.size() == 0) 594 continue; 595 596 const MCDecodedPseudoProbe *CallProbe = 597 Binary->getCallProbeForAddr(SourceAddress); 598 if (CallProbe == nullptr) 599 continue; 600 601 // Record called target sample and its count. 602 SampleContextFrameVector FrameVec; 603 Binary->getInlineContextForProbe(CallProbe, FrameVec, true); 604 605 if (!FrameVec.empty()) { 606 FunctionSamples &FunctionProfile = 607 getLeafProfileAndAddTotalSamples(FrameVec, 0); 608 FunctionProfile.addCalledTargetSamples( 609 FrameVec.back().Location.LineOffset, 610 FrameVec.back().Location.Discriminator, 611 FunctionId(CalleeName), Count); 612 } 613 } 614 } 615 616 FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( 617 const SampleContextFrameVector &FrameVec, uint64_t Count) { 618 // Get top level profile 619 FunctionSamples *FunctionProfile = 620 &getTopLevelFunctionProfile(FrameVec[0].Func); 621 FunctionProfile->addTotalSamples(Count); 622 if (Binary->usePseudoProbes()) { 623 const auto *FuncDesc = Binary->getFuncDescForGUID( 624 FunctionProfile->getFunction().getHashCode()); 625 FunctionProfile->setFunctionHash(FuncDesc->FuncHash); 626 } 627 628 for (size_t I = 1; I < FrameVec.size(); I++) { 629 LineLocation Callsite( 630 FrameVec[I - 1].Location.LineOffset, 631 getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); 632 FunctionSamplesMap &SamplesMap = 633 FunctionProfile->functionSamplesAt(Callsite); 634 auto Ret = 635 SamplesMap.emplace(FrameVec[I].Func, FunctionSamples()); 636 if (Ret.second) { 637 SampleContext Context(FrameVec[I].Func); 638 Ret.first->second.setContext(Context); 639 } 640 FunctionProfile = &Ret.first->second; 641 FunctionProfile->addTotalSamples(Count); 642 if (Binary->usePseudoProbes()) { 643 const auto *FuncDesc = Binary->getFuncDescForGUID( 644 FunctionProfile->getFunction().getHashCode()); 645 FunctionProfile->setFunctionHash(FuncDesc->FuncHash); 646 } 647 } 648 649 return *FunctionProfile; 650 } 651 652 RangeSample 653 ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { 654 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); 655 if (FillZeroForAllFuncs) { 656 for (auto &FuncI : Binary->getAllBinaryFunctions()) { 657 for (auto &R : FuncI.second.Ranges) { 658 Ranges[{R.first, R.second - 1}] += 0; 659 } 660 } 661 } else { 662 // For each range, we search for all ranges of the function it belongs to 663 // and initialize it with zero count, so it remains zero if doesn't hit any 664 // samples. This is to be consistent with compiler that interpret zero count 665 // as unexecuted(cold). 666 for (const auto &I : RangeCounter) { 667 uint64_t StartAddress = I.first.first; 668 for (const auto &Range : Binary->getRanges(StartAddress)) 669 Ranges[{Range.first, Range.second - 1}] += 0; 670 } 671 } 672 RangeSample DisjointRanges; 673 findDisjointRanges(DisjointRanges, Ranges); 674 return DisjointRanges; 675 } 676 677 void ProfileGenerator::populateBodySamplesForAllFunctions( 678 const RangeSample &RangeCounter) { 679 for (const auto &Range : preprocessRangeCounter(RangeCounter)) { 680 uint64_t RangeBegin = Range.first.first; 681 uint64_t RangeEnd = Range.first.second; 682 uint64_t Count = Range.second; 683 684 InstructionPointer IP(Binary, RangeBegin, true); 685 // Disjoint ranges may have range in the middle of two instr, 686 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 687 // can be Addr1+1 to Addr2-1. We should ignore such range. 688 if (IP.Address > RangeEnd) 689 continue; 690 691 do { 692 const SampleContextFrameVector FrameVec = 693 Binary->getFrameLocationStack(IP.Address); 694 if (!FrameVec.empty()) { 695 // FIXME: As accumulating total count per instruction caused some 696 // regression, we changed to accumulate total count per byte as a 697 // workaround. Tuning hotness threshold on the compiler side might be 698 // necessary in the future. 699 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( 700 FrameVec, Count * Binary->getInstSize(IP.Address)); 701 updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(), 702 Count); 703 } 704 } while (IP.advance() && IP.Address <= RangeEnd); 705 } 706 } 707 708 StringRef 709 ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) { 710 // Get the function range by branch target if it's a call branch. 711 auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress); 712 713 // We won't accumulate sample count for a range whose start is not the real 714 // function entry such as outlined function or inner labels. 715 if (!FRange || !FRange->IsFuncEntry) 716 return StringRef(); 717 718 return FunctionSamples::getCanonicalFnName(FRange->getFuncName()); 719 } 720 721 void ProfileGenerator::populateBoundarySamplesForAllFunctions( 722 const BranchSample &BranchCounters) { 723 for (const auto &Entry : BranchCounters) { 724 uint64_t SourceAddress = Entry.first.first; 725 uint64_t TargetAddress = Entry.first.second; 726 uint64_t Count = Entry.second; 727 assert(Count != 0 && "Unexpected zero weight branch"); 728 729 StringRef CalleeName = getCalleeNameForAddress(TargetAddress); 730 if (CalleeName.size() == 0) 731 continue; 732 // Record called target sample and its count. 733 const SampleContextFrameVector &FrameVec = 734 Binary->getCachedFrameLocationStack(SourceAddress); 735 if (!FrameVec.empty()) { 736 FunctionSamples &FunctionProfile = 737 getLeafProfileAndAddTotalSamples(FrameVec, 0); 738 FunctionProfile.addCalledTargetSamples( 739 FrameVec.back().Location.LineOffset, 740 getBaseDiscriminator(FrameVec.back().Location.Discriminator), 741 FunctionId(CalleeName), Count); 742 } 743 // Add head samples for callee. 744 FunctionSamples &CalleeProfile = 745 getTopLevelFunctionProfile(FunctionId(CalleeName)); 746 CalleeProfile.addHeadSamples(Count); 747 } 748 } 749 750 void ProfileGeneratorBase::calculateBodySamplesAndSize( 751 const FunctionSamples &FSamples, uint64_t &TotalBodySamples, 752 uint64_t &FuncBodySize) { 753 // Note that ideally the size should be the number of function instruction. 754 // However, for probe-based profile, we don't have the accurate instruction 755 // count for each probe, instead, the probe sample is the samples count for 756 // the block, which is equivelant to 757 // total_instruction_samples/num_of_instruction in one block. Hence, we use 758 // the number of probe as a proxy for the function's size. 759 FuncBodySize += FSamples.getBodySamples().size(); 760 761 // The accumulated body samples re-calculated here could be different from the 762 // TotalSamples(getTotalSamples) field of FunctionSamples for line-number 763 // based profile. The reason is that TotalSamples is the sum of all the 764 // samples of the machine instruction in one source-code line, however, the 765 // entry of Bodysamples is the only max number of them, so the TotalSamples is 766 // usually much bigger than the accumulated body samples as one souce-code 767 // line can emit many machine instructions. We observed a regression when we 768 // switched to use the accumulated body samples(by using 769 // -update-total-samples). Hence, it's safer to re-calculate here to avoid 770 // such discrepancy. There is no problem for probe-based profile, as the 771 // TotalSamples is exactly the same as the accumulated body samples. 772 for (const auto &I : FSamples.getBodySamples()) 773 TotalBodySamples += I.second.getSamples(); 774 775 for (const auto &CallsiteSamples : FSamples.getCallsiteSamples()) 776 for (const auto &Callee : CallsiteSamples.second) { 777 // For binary-level density, the inlinees' samples and size should be 778 // included in the calculation. 779 calculateBodySamplesAndSize(Callee.second, TotalBodySamples, 780 FuncBodySize); 781 } 782 } 783 784 // Calculate Profile-density: 785 // Calculate the density for each function and sort them in descending order, 786 // keep accumulating their total samples unitl it exceeds the 787 // percentage_threshold(cut-off) of total profile samples, the profile-density 788 // is the last(minimum) function-density of the processed functions, which means 789 // all the functions hot to perf are on good density if the profile-density is 790 // good. The percentage_threshold(--profile-density-cutoff-hot) is configurable 791 // depending on how much regression the system want to tolerate. 792 double 793 ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) { 794 double ProfileDensity = 0.0; 795 796 uint64_t TotalProfileSamples = 0; 797 // A list of the function profile density and its total samples. 798 std::vector<std::pair<double, uint64_t>> FuncDensityList; 799 for (const auto &I : Profiles) { 800 uint64_t TotalBodySamples = 0; 801 uint64_t FuncBodySize = 0; 802 calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize); 803 804 if (FuncBodySize == 0) 805 continue; 806 807 double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize; 808 TotalProfileSamples += TotalBodySamples; 809 FuncDensityList.emplace_back(FuncDensity, TotalBodySamples); 810 } 811 812 // Sorted by the density in descending order. 813 llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A, 814 const std::pair<double, uint64_t> &B) { 815 if (A.first != B.first) 816 return A.first > B.first; 817 return A.second < B.second; 818 }); 819 820 uint64_t AccumulatedSamples = 0; 821 uint32_t I = 0; 822 assert(ProfileDensityCutOffHot <= 1000000 && 823 "The cutoff value is greater than 1000000(100%)"); 824 while (AccumulatedSamples < TotalProfileSamples * 825 static_cast<float>(ProfileDensityCutOffHot) / 826 1000000 && 827 I < FuncDensityList.size()) { 828 AccumulatedSamples += FuncDensityList[I].second; 829 ProfileDensity = FuncDensityList[I].first; 830 I++; 831 } 832 833 return ProfileDensity; 834 } 835 836 void ProfileGeneratorBase::calculateAndShowDensity( 837 const SampleProfileMap &Profiles) { 838 double Density = calculateDensity(Profiles); 839 showDensitySuggestion(Density); 840 } 841 842 FunctionSamples * 843 CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode, 844 bool WasLeafInlined) { 845 FunctionSamples *FProfile = ContextNode->getFunctionSamples(); 846 if (!FProfile) { 847 FSamplesList.emplace_back(); 848 FProfile = &FSamplesList.back(); 849 FProfile->setFunction(ContextNode->getFuncName()); 850 ContextNode->setFunctionSamples(FProfile); 851 } 852 // Update ContextWasInlined attribute for existing contexts. 853 // The current function can be called in two ways: 854 // - when processing a probe of the current frame 855 // - when processing the entry probe of an inlinee's frame, which 856 // is then used to update the callsite count of the current frame. 857 // The two can happen in any order, hence here we are making sure 858 // `ContextWasInlined` is always set as expected. 859 // TODO: Note that the former does not always happen if no probes of the 860 // current frame has samples, and if the latter happens, we could lose the 861 // attribute. This should be fixed. 862 if (WasLeafInlined) 863 FProfile->getContext().setAttribute(ContextWasInlined); 864 return FProfile; 865 } 866 867 ContextTrieNode * 868 CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context, 869 bool WasLeafInlined) { 870 ContextTrieNode *ContextNode = 871 ContextTracker.getOrCreateContextPath(Context, true); 872 getOrCreateFunctionSamples(ContextNode, WasLeafInlined); 873 return ContextNode; 874 } 875 876 void CSProfileGenerator::generateProfile() { 877 FunctionSamples::ProfileIsCS = true; 878 879 collectProfiledFunctions(); 880 881 if (Binary->usePseudoProbes()) { 882 Binary->decodePseudoProbe(); 883 if (InferMissingFrames) 884 initializeMissingFrameInferrer(); 885 } 886 887 if (SampleCounters) { 888 if (Binary->usePseudoProbes()) { 889 generateProbeBasedProfile(); 890 } else { 891 generateLineNumBasedProfile(); 892 } 893 } 894 895 if (Binary->getTrackFuncContextSize()) 896 computeSizeForProfiledFunctions(); 897 898 postProcessProfiles(); 899 } 900 901 void CSProfileGenerator::initializeMissingFrameInferrer() { 902 Binary->getMissingContextInferrer()->initialize(SampleCounters); 903 } 904 905 void CSProfileGenerator::inferMissingFrames( 906 const SmallVectorImpl<uint64_t> &Context, 907 SmallVectorImpl<uint64_t> &NewContext) { 908 Binary->inferMissingFrames(Context, NewContext); 909 } 910 911 void CSProfileGenerator::computeSizeForProfiledFunctions() { 912 for (auto *Func : Binary->getProfiledFunctions()) 913 Binary->computeInlinedContextSizeForFunc(Func); 914 915 // Flush the symbolizer to save memory. 916 Binary->flushSymbolizer(); 917 } 918 919 void CSProfileGenerator::updateFunctionSamples() { 920 for (auto *Node : ContextTracker) { 921 FunctionSamples *FSamples = Node->getFunctionSamples(); 922 if (FSamples) { 923 if (UpdateTotalSamples) 924 FSamples->updateTotalSamples(); 925 FSamples->updateCallsiteSamples(); 926 } 927 } 928 } 929 930 void CSProfileGenerator::generateLineNumBasedProfile() { 931 for (const auto &CI : *SampleCounters) { 932 const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr()); 933 934 ContextTrieNode *ContextNode = &getRootContext(); 935 // Sample context will be empty if the jump is an external-to-internal call 936 // pattern, the head samples should be added for the internal function. 937 if (!CtxKey->Context.empty()) { 938 // Get or create function profile for the range 939 ContextNode = 940 getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined); 941 // Fill in function body samples 942 populateBodySamplesForFunction(*ContextNode->getFunctionSamples(), 943 CI.second.RangeCounter); 944 } 945 // Fill in boundary sample counts as well as call site samples for calls 946 populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter); 947 } 948 // Fill in call site value sample for inlined calls and also use context to 949 // infer missing samples. Since we don't have call count for inlined 950 // functions, we estimate it from inlinee's profile using the entry of the 951 // body sample. 952 populateInferredFunctionSamples(getRootContext()); 953 954 updateFunctionSamples(); 955 } 956 957 void CSProfileGenerator::populateBodySamplesForFunction( 958 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { 959 // Compute disjoint ranges first, so we can use MAX 960 // for calculating count for each location. 961 RangeSample Ranges; 962 findDisjointRanges(Ranges, RangeCounter); 963 for (const auto &Range : Ranges) { 964 uint64_t RangeBegin = Range.first.first; 965 uint64_t RangeEnd = Range.first.second; 966 uint64_t Count = Range.second; 967 // Disjoint ranges have introduce zero-filled gap that 968 // doesn't belong to current context, filter them out. 969 if (Count == 0) 970 continue; 971 972 InstructionPointer IP(Binary, RangeBegin, true); 973 // Disjoint ranges may have range in the middle of two instr, 974 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 975 // can be Addr1+1 to Addr2-1. We should ignore such range. 976 if (IP.Address > RangeEnd) 977 continue; 978 979 do { 980 auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address); 981 if (LeafLoc) { 982 // Recording body sample for this specific context 983 updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); 984 FunctionProfile.addTotalSamples(Count); 985 } 986 } while (IP.advance() && IP.Address <= RangeEnd); 987 } 988 } 989 990 void CSProfileGenerator::populateBoundarySamplesForFunction( 991 ContextTrieNode *Node, const BranchSample &BranchCounters) { 992 993 for (const auto &Entry : BranchCounters) { 994 uint64_t SourceAddress = Entry.first.first; 995 uint64_t TargetAddress = Entry.first.second; 996 uint64_t Count = Entry.second; 997 assert(Count != 0 && "Unexpected zero weight branch"); 998 999 StringRef CalleeName = getCalleeNameForAddress(TargetAddress); 1000 if (CalleeName.size() == 0) 1001 continue; 1002 1003 ContextTrieNode *CallerNode = Node; 1004 LineLocation CalleeCallSite(0, 0); 1005 if (CallerNode != &getRootContext()) { 1006 // Record called target sample and its count 1007 auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress); 1008 if (LeafLoc) { 1009 CallerNode->getFunctionSamples()->addCalledTargetSamples( 1010 LeafLoc->Location.LineOffset, 1011 getBaseDiscriminator(LeafLoc->Location.Discriminator), 1012 FunctionId(CalleeName), 1013 Count); 1014 // Record head sample for called target(callee) 1015 CalleeCallSite = LeafLoc->Location; 1016 } 1017 } 1018 1019 ContextTrieNode *CalleeNode = 1020 CallerNode->getOrCreateChildContext(CalleeCallSite, 1021 FunctionId(CalleeName)); 1022 FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode); 1023 CalleeProfile->addHeadSamples(Count); 1024 } 1025 } 1026 1027 void CSProfileGenerator::populateInferredFunctionSamples( 1028 ContextTrieNode &Node) { 1029 // There is no call jmp sample between the inliner and inlinee, we need to use 1030 // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s 1031 // sample depends on child(inlinee)'s sample, so traverse the tree in 1032 // post-order. 1033 for (auto &It : Node.getAllChildContext()) 1034 populateInferredFunctionSamples(It.second); 1035 1036 FunctionSamples *CalleeProfile = Node.getFunctionSamples(); 1037 if (!CalleeProfile) 1038 return; 1039 // If we already have head sample counts, we must have value profile 1040 // for call sites added already. Skip to avoid double counting. 1041 if (CalleeProfile->getHeadSamples()) 1042 return; 1043 ContextTrieNode *CallerNode = Node.getParentContext(); 1044 // If we don't have context, nothing to do for caller's call site. 1045 // This could happen for entry point function. 1046 if (CallerNode == &getRootContext()) 1047 return; 1048 1049 LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc(); 1050 FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode); 1051 // Since we don't have call count for inlined functions, we 1052 // estimate it from inlinee's profile using entry body sample. 1053 uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate(); 1054 // If we don't have samples with location, use 1 to indicate live. 1055 if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size()) 1056 EstimatedCallCount = 1; 1057 CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset, 1058 CallerLeafFrameLoc.Discriminator, 1059 Node.getFuncName(), EstimatedCallCount); 1060 CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset, 1061 CallerLeafFrameLoc.Discriminator, 1062 EstimatedCallCount); 1063 CallerProfile.addTotalSamples(EstimatedCallCount); 1064 } 1065 1066 void CSProfileGenerator::convertToProfileMap( 1067 ContextTrieNode &Node, SampleContextFrameVector &Context) { 1068 FunctionSamples *FProfile = Node.getFunctionSamples(); 1069 if (FProfile) { 1070 Context.emplace_back(Node.getFuncName(), LineLocation(0, 0)); 1071 // Save the new context for future references. 1072 SampleContextFrames NewContext = *Contexts.insert(Context).first; 1073 auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile)); 1074 FunctionSamples &NewProfile = Ret.first->second; 1075 NewProfile.getContext().setContext(NewContext); 1076 Context.pop_back(); 1077 } 1078 1079 for (auto &It : Node.getAllChildContext()) { 1080 ContextTrieNode &ChildNode = It.second; 1081 Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc()); 1082 convertToProfileMap(ChildNode, Context); 1083 Context.pop_back(); 1084 } 1085 } 1086 1087 void CSProfileGenerator::convertToProfileMap() { 1088 assert(ProfileMap.empty() && 1089 "ProfileMap should be empty before converting from the trie"); 1090 assert(IsProfileValidOnTrie && 1091 "Do not convert the trie twice, it's already destroyed"); 1092 1093 SampleContextFrameVector Context; 1094 for (auto &It : getRootContext().getAllChildContext()) 1095 convertToProfileMap(It.second, Context); 1096 1097 IsProfileValidOnTrie = false; 1098 } 1099 1100 void CSProfileGenerator::postProcessProfiles() { 1101 // Compute hot/cold threshold based on profile. This will be used for cold 1102 // context profile merging/trimming. 1103 computeSummaryAndThreshold(); 1104 1105 // Run global pre-inliner to adjust/merge context profile based on estimated 1106 // inline decisions. 1107 if (EnableCSPreInliner) { 1108 ContextTracker.populateFuncToCtxtMap(); 1109 CSPreInliner(ContextTracker, *Binary, Summary.get()).run(); 1110 // Turn off the profile merger by default unless it is explicitly enabled. 1111 if (!CSProfMergeColdContext.getNumOccurrences()) 1112 CSProfMergeColdContext = false; 1113 } 1114 1115 convertToProfileMap(); 1116 1117 // Trim and merge cold context profile using cold threshold above. 1118 if (TrimColdProfile || CSProfMergeColdContext) { 1119 SampleContextTrimmer(ProfileMap) 1120 .trimAndMergeColdContextProfiles( 1121 HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, 1122 CSProfMaxColdContextDepth, EnableCSPreInliner); 1123 } 1124 1125 if (GenCSNestedProfile) { 1126 ProfileConverter CSConverter(ProfileMap); 1127 CSConverter.convertCSProfiles(); 1128 FunctionSamples::ProfileIsCS = false; 1129 } 1130 filterAmbiguousProfile(ProfileMap); 1131 ProfileGeneratorBase::calculateAndShowDensity(ProfileMap); 1132 } 1133 1134 void ProfileGeneratorBase::computeSummaryAndThreshold( 1135 SampleProfileMap &Profiles) { 1136 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); 1137 Summary = Builder.computeSummaryForProfiles(Profiles); 1138 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( 1139 (Summary->getDetailedSummary())); 1140 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( 1141 (Summary->getDetailedSummary())); 1142 } 1143 1144 void CSProfileGenerator::computeSummaryAndThreshold() { 1145 // Always merge and use context-less profile map to compute summary. 1146 SampleProfileMap ContextLessProfiles; 1147 ContextTracker.createContextLessProfileMap(ContextLessProfiles); 1148 1149 // Set the flag below to avoid merging the profile again in 1150 // computeSummaryAndThreshold 1151 FunctionSamples::ProfileIsCS = false; 1152 assert( 1153 (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) && 1154 "Don't set --profile-summary-contextless to false for profile " 1155 "generation"); 1156 ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles); 1157 // Recover the old value. 1158 FunctionSamples::ProfileIsCS = true; 1159 } 1160 1161 void ProfileGeneratorBase::extractProbesFromRange( 1162 const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter, 1163 bool FindDisjointRanges) { 1164 const RangeSample *PRanges = &RangeCounter; 1165 RangeSample Ranges; 1166 if (FindDisjointRanges) { 1167 findDisjointRanges(Ranges, RangeCounter); 1168 PRanges = &Ranges; 1169 } 1170 1171 for (const auto &Range : *PRanges) { 1172 uint64_t RangeBegin = Range.first.first; 1173 uint64_t RangeEnd = Range.first.second; 1174 uint64_t Count = Range.second; 1175 1176 InstructionPointer IP(Binary, RangeBegin, true); 1177 // Disjoint ranges may have range in the middle of two instr, 1178 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range 1179 // can be Addr1+1 to Addr2-1. We should ignore such range. 1180 if (IP.Address > RangeEnd) 1181 continue; 1182 1183 do { 1184 const AddressProbesMap &Address2ProbesMap = 1185 Binary->getAddress2ProbesMap(); 1186 auto It = Address2ProbesMap.find(IP.Address); 1187 if (It != Address2ProbesMap.end()) { 1188 for (const MCDecodedPseudoProbe &Probe : It->second) { 1189 ProbeCounter[&Probe] += Count; 1190 } 1191 } 1192 } while (IP.advance() && IP.Address <= RangeEnd); 1193 } 1194 } 1195 1196 static void extractPrefixContextStack(SampleContextFrameVector &ContextStack, 1197 const SmallVectorImpl<uint64_t> &AddrVec, 1198 ProfiledBinary *Binary) { 1199 SmallVector<const MCDecodedPseudoProbe *, 16> Probes; 1200 for (auto Address : reverse(AddrVec)) { 1201 const MCDecodedPseudoProbe *CallProbe = 1202 Binary->getCallProbeForAddr(Address); 1203 // These could be the cases when a probe is not found at a calliste. Cutting 1204 // off the context from here since the inliner will not know how to consume 1205 // a context with unknown callsites. 1206 // 1. for functions that are not sampled when 1207 // --decode-probe-for-profiled-functions-only is on. 1208 // 2. for a merged callsite. Callsite merging may cause the loss of original 1209 // probe IDs. 1210 // 3. for an external callsite. 1211 if (!CallProbe) 1212 break; 1213 Probes.push_back(CallProbe); 1214 } 1215 1216 std::reverse(Probes.begin(), Probes.end()); 1217 1218 // Extract context stack for reusing, leaf context stack will be added 1219 // compressed while looking up function profile. 1220 for (const auto *P : Probes) { 1221 Binary->getInlineContextForProbe(P, ContextStack, true); 1222 } 1223 } 1224 1225 void CSProfileGenerator::generateProbeBasedProfile() { 1226 // Enable pseudo probe functionalities in SampleProf 1227 FunctionSamples::ProfileIsProbeBased = true; 1228 for (const auto &CI : *SampleCounters) { 1229 const AddrBasedCtxKey *CtxKey = 1230 dyn_cast<AddrBasedCtxKey>(CI.first.getPtr()); 1231 // Fill in function body samples from probes, also infer caller's samples 1232 // from callee's probe 1233 populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey); 1234 // Fill in boundary samples for a call probe 1235 populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey); 1236 } 1237 } 1238 1239 void CSProfileGenerator::populateBodySamplesWithProbes( 1240 const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) { 1241 ProbeCounterMap ProbeCounter; 1242 // Extract the top frame probes by looking up each address among the range in 1243 // the Address2ProbeMap 1244 extractProbesFromRange(RangeCounter, ProbeCounter); 1245 std::unordered_map<MCDecodedPseudoProbeInlineTree *, 1246 std::unordered_set<FunctionSamples *>> 1247 FrameSamples; 1248 for (const auto &PI : ProbeCounter) { 1249 const MCDecodedPseudoProbe *Probe = PI.first; 1250 uint64_t Count = PI.second; 1251 // Disjoint ranges have introduce zero-filled gap that 1252 // doesn't belong to current context, filter them out. 1253 if (!Probe->isBlock() || Count == 0) 1254 continue; 1255 1256 ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe); 1257 FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples(); 1258 // Record the current frame and FunctionProfile whenever samples are 1259 // collected for non-danglie probes. This is for reporting all of the 1260 // zero count probes of the frame later. 1261 FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile); 1262 FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(), 1263 Count); 1264 FunctionProfile.addTotalSamples(Count); 1265 if (Probe->isEntry()) { 1266 FunctionProfile.addHeadSamples(Count); 1267 // Look up for the caller's function profile 1268 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe); 1269 ContextTrieNode *CallerNode = ContextNode->getParentContext(); 1270 if (InlinerDesc != nullptr && CallerNode != &getRootContext()) { 1271 // Since the context id will be compressed, we have to use callee's 1272 // context id to infer caller's context id to ensure they share the 1273 // same context prefix. 1274 uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset; 1275 uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator; 1276 assert(CallerIndex && 1277 "Inferred caller's location index shouldn't be zero!"); 1278 assert(!CallerDiscriminator && 1279 "Callsite probe should not have a discriminator!"); 1280 FunctionSamples &CallerProfile = 1281 *getOrCreateFunctionSamples(CallerNode); 1282 CallerProfile.setFunctionHash(InlinerDesc->FuncHash); 1283 CallerProfile.addBodySamples(CallerIndex, CallerDiscriminator, Count); 1284 CallerProfile.addTotalSamples(Count); 1285 CallerProfile.addCalledTargetSamples(CallerIndex, CallerDiscriminator, 1286 ContextNode->getFuncName(), Count); 1287 } 1288 } 1289 } 1290 1291 // Assign zero count for remaining probes without sample hits to 1292 // differentiate from probes optimized away, of which the counts are unknown 1293 // and will be inferred by the compiler. 1294 for (auto &I : FrameSamples) { 1295 for (auto *FunctionProfile : I.second) { 1296 for (auto *Probe : I.first->getProbes()) { 1297 FunctionProfile->addBodySamples(Probe->getIndex(), 1298 Probe->getDiscriminator(), 0); 1299 } 1300 } 1301 } 1302 } 1303 1304 void CSProfileGenerator::populateBoundarySamplesWithProbes( 1305 const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) { 1306 for (const auto &BI : BranchCounter) { 1307 uint64_t SourceAddress = BI.first.first; 1308 uint64_t TargetAddress = BI.first.second; 1309 uint64_t Count = BI.second; 1310 const MCDecodedPseudoProbe *CallProbe = 1311 Binary->getCallProbeForAddr(SourceAddress); 1312 if (CallProbe == nullptr) 1313 continue; 1314 FunctionSamples &FunctionProfile = 1315 getFunctionProfileForLeafProbe(CtxKey, CallProbe); 1316 FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); 1317 FunctionProfile.addTotalSamples(Count); 1318 StringRef CalleeName = getCalleeNameForAddress(TargetAddress); 1319 if (CalleeName.size() == 0) 1320 continue; 1321 FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 1322 CallProbe->getDiscriminator(), 1323 FunctionId(CalleeName), Count); 1324 } 1325 } 1326 1327 ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe( 1328 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { 1329 1330 const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context; 1331 SmallVector<uint64_t, 16> NewContext; 1332 1333 if (InferMissingFrames) { 1334 SmallVector<uint64_t, 16> Context = CtxKey->Context; 1335 // Append leaf frame for a complete inference. 1336 Context.push_back(LeafProbe->getAddress()); 1337 inferMissingFrames(Context, NewContext); 1338 // Pop out the leaf probe that was pushed in above. 1339 NewContext.pop_back(); 1340 PContext = &NewContext; 1341 } 1342 1343 SampleContextFrameVector ContextStack; 1344 extractPrefixContextStack(ContextStack, *PContext, Binary); 1345 1346 // Explicitly copy the context for appending the leaf context 1347 SampleContextFrameVector NewContextStack(ContextStack.begin(), 1348 ContextStack.end()); 1349 Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); 1350 // For leaf inlined context with the top frame, we should strip off the top 1351 // frame's probe id, like: 1352 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" 1353 auto LeafFrame = NewContextStack.back(); 1354 LeafFrame.Location = LineLocation(0, 0); 1355 NewContextStack.pop_back(); 1356 // Compress the context string except for the leaf frame 1357 CSProfileGenerator::compressRecursionContext(NewContextStack); 1358 CSProfileGenerator::trimContext(NewContextStack); 1359 NewContextStack.push_back(LeafFrame); 1360 1361 const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); 1362 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); 1363 ContextTrieNode *ContextNode = 1364 getOrCreateContextNode(NewContextStack, WasLeafInlined); 1365 ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash); 1366 return ContextNode; 1367 } 1368 1369 FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( 1370 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { 1371 return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples(); 1372 } 1373 1374 } // end namespace sampleprof 1375 } // end namespace llvm 1376