1 //===- bolt/Profile/DataAggregator.h - Perf data aggregator -----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This family of functions reads profile data written by perf record, 10 // aggregates it and then writes it back to an output file. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef BOLT_PROFILE_DATA_AGGREGATOR_H 15 #define BOLT_PROFILE_DATA_AGGREGATOR_H 16 17 #include "bolt/Profile/DataReader.h" 18 #include "bolt/Profile/YAMLProfileWriter.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/Support/Error.h" 21 #include "llvm/Support/Program.h" 22 #include <unordered_map> 23 24 namespace llvm { 25 namespace bolt { 26 27 class BinaryFunction; 28 class BinaryContext; 29 class BoltAddressTranslation; 30 31 /// DataAggregator inherits all parsing logic from DataReader as well as 32 /// its data structures used to represent aggregated profile data in memory. 33 /// 34 /// The aggregator works by dispatching two separate perf-script jobs that 35 /// read perf samples and perf task annotations. Later, we read the output 36 /// files to extract information about which PID was used for this binary. 37 /// With the PID, we filter the samples and extract all LBR entries. 38 /// 39 /// To aggregate LBR entries, we rely on a BinaryFunction map to locate the 40 /// original function where the event happened. Then, we convert a raw address 41 /// to an offset relative to the start of this function and aggregate branch 42 /// information for each function. 43 /// 44 /// This must be coordinated with RewriteInstance so we have BinaryFunctions in 45 /// State::Disassembled. After this state, BinaryFunction will drop the 46 /// instruction map with original addresses we rely on to validate the traces 47 /// found in the LBR. 48 /// 49 /// The last step is to write the aggregated data to disk in the output file 50 /// specified by the user. 51 class DataAggregator : public DataReader { 52 public: 53 explicit DataAggregator(StringRef Filename) : DataReader(Filename) { 54 start(); 55 } 56 57 ~DataAggregator(); 58 59 StringRef getReaderName() const override { return "perf data aggregator"; } 60 61 bool isTrustedSource() const override { return true; } 62 63 Error preprocessProfile(BinaryContext &BC) override; 64 65 Error readProfilePreCFG(BinaryContext &BC) override { 66 return Error::success(); 67 } 68 69 Error readProfile(BinaryContext &BC) override; 70 71 bool mayHaveProfileData(const BinaryFunction &BF) override; 72 73 /// Set Bolt Address Translation Table when processing samples collected in 74 /// bolted binaries 75 void setBAT(BoltAddressTranslation *B) override { BAT = B; } 76 77 /// Check whether \p FileName is a perf.data file 78 static bool checkPerfDataMagic(StringRef FileName); 79 80 private: 81 struct PerfBranchSample { 82 SmallVector<LBREntry, 32> LBR; 83 }; 84 85 struct PerfBasicSample { 86 StringRef EventName; 87 uint64_t PC; 88 }; 89 90 struct PerfMemSample { 91 uint64_t PC; 92 uint64_t Addr; 93 }; 94 95 /// Used for parsing specific pre-aggregated input files. 96 struct AggregatedLBREntry { 97 enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN }; 98 Location From; 99 Location To; 100 uint64_t Count; 101 uint64_t Mispreds; 102 Type EntryType; 103 }; 104 105 struct Trace { 106 uint64_t From; 107 uint64_t To; 108 Trace(uint64_t From, uint64_t To) : From(From), To(To) {} 109 bool operator==(const Trace &Other) const { 110 return From == Other.From && To == Other.To; 111 } 112 }; 113 114 struct TraceHash { 115 size_t operator()(const Trace &L) const { 116 return std::hash<uint64_t>()(L.From << 32 | L.To); 117 } 118 }; 119 120 struct FTInfo { 121 uint64_t InternCount{0}; 122 uint64_t ExternCount{0}; 123 }; 124 125 struct TakenBranchInfo { 126 uint64_t TakenCount{0}; 127 uint64_t MispredCount{0}; 128 }; 129 130 /// Intermediate storage for profile data. We save the results of parsing 131 /// and use them later for processing and assigning profile. 132 std::unordered_map<Trace, TakenBranchInfo, TraceHash> BranchLBRs; 133 std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs; 134 std::vector<AggregatedLBREntry> AggregatedLBRs; 135 std::unordered_map<uint64_t, uint64_t> BasicSamples; 136 std::vector<PerfMemSample> MemSamples; 137 138 template <typename T> void clear(T &Container) { 139 T TempContainer; 140 TempContainer.swap(Container); 141 } 142 143 /// Perf utility full path name 144 std::string PerfPath; 145 146 /// Perf process spawning bookkeeping 147 struct PerfProcessInfo { 148 bool IsFinished{false}; 149 sys::ProcessInfo PI; 150 SmallVector<char, 256> StdoutPath; 151 SmallVector<char, 256> StderrPath; 152 }; 153 154 /// Process info for spawned processes 155 PerfProcessInfo MainEventsPPI; 156 PerfProcessInfo MemEventsPPI; 157 PerfProcessInfo MMapEventsPPI; 158 PerfProcessInfo TaskEventsPPI; 159 160 /// Kernel VM starts at fixed based address 161 /// https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt 162 static constexpr uint64_t KernelBaseAddr = 0xffff800000000000; 163 164 /// Current list of created temporary files 165 std::vector<std::string> TempFiles; 166 167 /// Name of the binary with matching build-id from perf.data if different 168 /// from the file name in BC. 169 std::string BuildIDBinaryName; 170 171 /// Memory map info for a single file as recorded in perf.data 172 /// When a binary has multiple text segments, the Size is computed as the 173 /// difference of the last address of these segments from the BaseAddress. 174 /// The base addresses of all text segments must be the same. 175 struct MMapInfo { 176 uint64_t BaseAddress{0}; /// Base address of the mapped binary. 177 uint64_t MMapAddress{0}; /// Address of the executable segment. 178 uint64_t Size{0}; /// Size of the mapping. 179 uint64_t Offset{0}; /// File offset of the mapped segment. 180 int32_t PID{-1}; /// Process ID. 181 bool Forked{false}; /// Was the process forked? 182 uint64_t Time{0ULL}; /// Time in micro seconds. 183 }; 184 185 /// Per-PID map info for the binary 186 std::unordered_map<uint64_t, MMapInfo> BinaryMMapInfo; 187 188 /// Fork event info 189 struct ForkInfo { 190 int32_t ParentPID; 191 int32_t ChildPID; 192 uint64_t Time{0ULL}; 193 }; 194 195 /// References to core BOLT data structures 196 BinaryContext *BC{nullptr}; 197 198 BoltAddressTranslation *BAT{nullptr}; 199 200 /// Update function execution profile with a recorded trace. 201 /// A trace is region of code executed between two LBR entries supplied in 202 /// execution order. 203 /// 204 /// Return a vector of offsets corresponding to a trace in a function 205 /// if the trace is valid, std::nullopt otherwise. 206 std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>> 207 getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First, 208 const LBREntry &Second, uint64_t Count = 1) const; 209 210 /// Record external entry into the function \p BF. 211 /// 212 /// Return true if the entry is valid, false otherwise. 213 bool recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, 214 uint64_t Count = 1) const; 215 216 /// Record exit from the function \p BF via a call or return. 217 /// 218 /// Return true if the exit point is valid, false otherwise. 219 bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, 220 uint64_t Count = 1) const; 221 222 /// Aggregation statistics 223 uint64_t NumInvalidTraces{0}; 224 uint64_t NumLongRangeTraces{0}; 225 /// Specifies how many samples were recorded in cold areas if we are dealing 226 /// with profiling data collected in a bolted binary. For LBRs, incremented 227 /// for the source of the branch to avoid counting cold activity twice (one 228 /// for source and another for destination). 229 uint64_t NumColdSamples{0}; 230 231 /// Looks into system PATH for Linux Perf and set up the aggregator to use it 232 void findPerfExecutable(); 233 234 /// Launch a perf subprocess with given args and save output for later 235 /// parsing. 236 void launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, 237 const char *ArgsString, bool Wait); 238 239 /// Delete all temporary files created to hold the output generated by spawned 240 /// subprocesses during the aggregation job 241 void deleteTempFiles(); 242 243 // Semantic pass helpers 244 245 /// Look up which function contains an address by using out map of 246 /// disassembled BinaryFunctions 247 BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const; 248 249 /// Perform BAT translation for a given \p Func and return the parent 250 /// BinaryFunction or nullptr. 251 BinaryFunction *getBATParentFunction(const BinaryFunction &Func) const; 252 253 /// Retrieve the location name to be used for samples recorded in \p Func. 254 static StringRef getLocationName(const BinaryFunction &Func, bool BAT); 255 256 /// Semantic actions - parser hooks to interpret parsed perf samples 257 /// Register a sample (non-LBR mode), i.e. a new hit at \p Address 258 bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count); 259 260 /// Register an intraprocedural branch \p Branch. 261 bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To, 262 uint64_t Count, uint64_t Mispreds); 263 264 /// Register an interprocedural branch from \p FromFunc to \p ToFunc with 265 /// offsets \p From and \p To, respectively. 266 bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc, 267 uint64_t From, uint64_t To, uint64_t Count, 268 uint64_t Mispreds); 269 270 /// Register a \p Branch. 271 bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds, 272 bool IsPreagg); 273 274 /// Register a trace between two LBR entries supplied in execution order. 275 bool doTrace(const LBREntry &First, const LBREntry &Second, 276 uint64_t Count = 1); 277 278 /// Parser helpers 279 /// Return false if we exhausted our parser buffer and finished parsing 280 /// everything 281 bool hasData() const { return !ParsingBuf.empty(); } 282 283 /// Print heat map based on LBR samples. 284 std::error_code printLBRHeatMap(); 285 286 /// Parse a single perf sample containing a PID associated with a sequence of 287 /// LBR entries. If the PID does not correspond to the binary we are looking 288 /// for, return std::errc::no_such_process. If other parsing errors occur, 289 /// return the error. Otherwise, return the parsed sample. 290 ErrorOr<PerfBranchSample> parseBranchSample(); 291 292 /// Parse a single perf sample containing a PID associated with an event name 293 /// and a PC 294 ErrorOr<PerfBasicSample> parseBasicSample(); 295 296 /// Parse a single perf sample containing a PID associated with an IP and 297 /// address. 298 ErrorOr<PerfMemSample> parseMemSample(); 299 300 /// Parse pre-aggregated LBR samples created by an external tool 301 ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry(); 302 303 /// Parse either buildid:offset or just offset, representing a location in the 304 /// binary. Used exclusively for pre-aggregated LBR samples. 305 ErrorOr<Location> parseLocationOrOffset(); 306 307 /// Check if a field separator is the next char to parse and, if yes, consume 308 /// it and return true 309 bool checkAndConsumeFS(); 310 311 /// Consume the entire line 312 void consumeRestOfLine(); 313 314 /// True if the next token in the parsing buffer is a new line, but don't 315 /// consume it (peek only). 316 bool checkNewLine(); 317 318 using PerfProcessErrorCallbackTy = std::function<void(int, StringRef)>; 319 /// Prepare to parse data from a given perf script invocation. 320 /// Returns an invocation exit code. 321 int prepareToParse(StringRef Name, PerfProcessInfo &Process, 322 PerfProcessErrorCallbackTy Callback); 323 324 /// Parse a single LBR entry as output by perf script -Fbrstack 325 ErrorOr<LBREntry> parseLBREntry(); 326 327 /// Parse LBR sample, returns the number of traces. 328 uint64_t parseLBRSample(const PerfBranchSample &Sample, bool NeedsSkylakeFix); 329 330 /// Parse and pre-aggregate branch events. 331 std::error_code parseBranchEvents(); 332 333 /// Process all branch events. 334 void processBranchEvents(); 335 336 /// Parse the full output generated by perf script to report non-LBR samples. 337 std::error_code parseBasicEvents(); 338 339 /// Process non-LBR events. 340 void processBasicEvents(); 341 342 /// Parse the full output generated by perf script to report memory events. 343 std::error_code parseMemEvents(); 344 345 /// Process parsed memory events profile. 346 void processMemEvents(); 347 348 /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping 349 /// between the binary name and its memory layout in a process with a given 350 /// PID. 351 /// On success return a <FileName, MMapInfo> pair. 352 ErrorOr<std::pair<StringRef, MMapInfo>> parseMMapEvent(); 353 354 /// Parse PERF_RECORD_FORK event. 355 std::optional<ForkInfo> parseForkEvent(); 356 357 /// Parse 'PERF_RECORD_COMM exec'. Don't consume the string. 358 std::optional<int32_t> parseCommExecEvent(); 359 360 /// Parse the full output generated by `perf script --show-mmap-events` 361 /// to generate mapping between binary files and their memory mappings for 362 /// all PIDs. 363 std::error_code parseMMapEvents(); 364 365 /// Parse output of `perf script --show-task-events`, and forked processes 366 /// to the set of tracked PIDs. 367 std::error_code parseTaskEvents(); 368 369 /// Parse a single pair of binary full path and associated build-id 370 std::optional<std::pair<StringRef, StringRef>> parseNameBuildIDPair(); 371 372 /// Coordinate reading and parsing of pre-aggregated file 373 /// 374 /// The regular perf2bolt aggregation job is to read perf output directly. 375 /// However, if the data is coming from a database instead of perf, one could 376 /// write a query to produce a pre-aggregated file. This function deals with 377 /// this case. 378 /// 379 /// The pre-aggregated file contains aggregated LBR data, but without binary 380 /// knowledge. BOLT will parse it and, using information from the disassembled 381 /// binary, augment it with fall-through edge frequency information. After 382 /// this step is finished, this data can be either written to disk to be 383 /// consumed by BOLT later, or can be used by BOLT immediately if kept in 384 /// memory. 385 /// 386 /// File format syntax: 387 /// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count> 388 /// [<mispred_count>] 389 /// 390 /// B - indicates an aggregated branch 391 /// F - an aggregated fall-through 392 /// f - an aggregated fall-through with external origin - used to disambiguate 393 /// between a return hitting a basic block head and a regular internal 394 /// jump to the block 395 /// 396 /// <start_id> - build id of the object containing the start address. We can 397 /// skip it for the main binary and use "X" for an unknown object. This will 398 /// save some space and facilitate human parsing. 399 /// 400 /// <start_offset> - hex offset from the object base load address (0 for the 401 /// main executable unless it's PIE) to the start address. 402 /// 403 /// <end_id>, <end_offset> - same for the end address. 404 /// 405 /// <count> - total aggregated count of the branch or a fall-through. 406 /// 407 /// <mispred_count> - the number of times the branch was mispredicted. 408 /// Omitted for fall-throughs. 409 /// 410 /// Example: 411 /// F 41be50 41be50 3 412 /// F 41be90 41be90 4 413 /// B 4b1942 39b57f0 3 0 414 /// B 4b196f 4b19e0 2 0 415 void parsePreAggregated(); 416 417 /// Parse the full output of pre-aggregated LBR samples generated by 418 /// an external tool. 419 std::error_code parsePreAggregatedLBRSamples(); 420 421 /// Process parsed pre-aggregated data. 422 void processPreAggregated(); 423 424 /// If \p Address falls into the binary address space based on memory 425 /// mapping info \p MMI, then adjust it for further processing by subtracting 426 /// the base load address. External addresses, i.e. addresses that do not 427 /// correspond to the binary allocated address space, are adjusted to avoid 428 /// conflicts. 429 void adjustAddress(uint64_t &Address, const MMapInfo &MMI) const { 430 if (Address >= MMI.MMapAddress && Address < MMI.MMapAddress + MMI.Size) { 431 Address -= MMI.BaseAddress; 432 } else if (Address < MMI.Size) { 433 // Make sure the address is not treated as belonging to the binary. 434 Address = (-1ULL); 435 } 436 } 437 438 /// Adjust addresses in \p LBR entry. 439 void adjustLBR(LBREntry &LBR, const MMapInfo &MMI) const { 440 adjustAddress(LBR.From, MMI); 441 adjustAddress(LBR.To, MMI); 442 } 443 444 /// Ignore kernel/user transition LBR if requested 445 bool ignoreKernelInterrupt(LBREntry &LBR) const; 446 447 /// Populate functions in \p BC with profile. 448 void processProfile(BinaryContext &BC); 449 450 /// Start an aggregation job asynchronously. 451 void start(); 452 453 /// Returns true if this aggregation job is using a translation table to 454 /// remap samples collected on binaries already processed by BOLT. 455 bool usesBAT() const { return BAT; } 456 457 /// Force all subprocesses to stop and cancel aggregation 458 void abort(); 459 460 /// Dump data structures into a file readable by llvm-bolt 461 std::error_code writeAggregatedFile(StringRef OutputFilename) const; 462 463 /// Dump translated data structures into YAML 464 std::error_code writeBATYAML(BinaryContext &BC, 465 StringRef OutputFilename) const; 466 467 /// Filter out binaries based on PID 468 void filterBinaryMMapInfo(); 469 470 /// If we have a build-id available for the input file, use it to assist 471 /// matching profile to a binary. 472 /// 473 /// If the binary name changed after profile collection, use build-id 474 /// to get the proper name in perf data when build-ids are available. 475 /// If \p FileBuildID has no match, then issue an error and exit. 476 void processFileBuildID(StringRef FileBuildID); 477 478 /// Debugging dump methods 479 void dump() const; 480 void dump(const LBREntry &LBR) const; 481 void dump(const PerfBranchSample &Sample) const; 482 void dump(const PerfMemSample &Sample) const; 483 484 public: 485 /// If perf.data was collected without build ids, the buildid-list may contain 486 /// incomplete entries. Return true if the buffer containing 487 /// "perf buildid-list" output has only valid entries and is non- empty. 488 /// Return false otherwise. 489 bool hasAllBuildIDs(); 490 491 /// Parse the output generated by "perf buildid-list" to extract build-ids 492 /// and return a file name matching a given \p FileBuildID. 493 std::optional<StringRef> getFileNameForBuildID(StringRef FileBuildID); 494 495 /// Get a constant reference to the parsed binary mmap entries. 496 const std::unordered_map<uint64_t, MMapInfo> &getBinaryMMapInfo() { 497 return BinaryMMapInfo; 498 } 499 500 friend class YAMLProfileWriter; 501 }; 502 } // namespace bolt 503 } // namespace llvm 504 505 #endif 506