xref: /llvm-project/bolt/include/bolt/Profile/DataAggregator.h (revision e6c9cd9c060c1fa8343398b9556a5a6c0f35d515)
1 //===- bolt/Profile/DataAggregator.h - Perf data aggregator -----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This family of functions reads profile data written by perf record,
10 // aggregates it and then writes it back to an output file.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef BOLT_PROFILE_DATA_AGGREGATOR_H
15 #define BOLT_PROFILE_DATA_AGGREGATOR_H
16 
17 #include "bolt/Profile/DataReader.h"
18 #include "bolt/Profile/YAMLProfileWriter.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/Support/Error.h"
21 #include "llvm/Support/Program.h"
22 #include <unordered_map>
23 
24 namespace llvm {
25 namespace bolt {
26 
27 class BinaryFunction;
28 class BinaryContext;
29 class BoltAddressTranslation;
30 
31 /// DataAggregator inherits all parsing logic from DataReader as well as
32 /// its data structures used to represent aggregated profile data in memory.
33 ///
34 /// The aggregator works by dispatching two separate perf-script jobs that
35 /// read perf samples and perf task annotations. Later, we read the output
36 /// files to extract information about which PID was used for this binary.
37 /// With the PID, we filter the samples and extract all LBR entries.
38 ///
39 /// To aggregate LBR entries, we rely on a BinaryFunction map to locate the
40 /// original function where the event happened. Then, we convert a raw address
41 /// to an offset relative to the start of this function and aggregate branch
42 /// information for each function.
43 ///
44 /// This must be coordinated with RewriteInstance so we have BinaryFunctions in
45 /// State::Disassembled. After this state, BinaryFunction will drop the
46 /// instruction map with original addresses we rely on to validate the traces
47 /// found in the LBR.
48 ///
49 /// The last step is to write the aggregated data to disk in the output file
50 /// specified by the user.
51 class DataAggregator : public DataReader {
52 public:
53   explicit DataAggregator(StringRef Filename) : DataReader(Filename) {
54     start();
55   }
56 
57   ~DataAggregator();
58 
59   StringRef getReaderName() const override { return "perf data aggregator"; }
60 
61   bool isTrustedSource() const override { return true; }
62 
63   Error preprocessProfile(BinaryContext &BC) override;
64 
65   Error readProfilePreCFG(BinaryContext &BC) override {
66     return Error::success();
67   }
68 
69   Error readProfile(BinaryContext &BC) override;
70 
71   bool mayHaveProfileData(const BinaryFunction &BF) override;
72 
73   /// Set Bolt Address Translation Table when processing samples collected in
74   /// bolted binaries
75   void setBAT(BoltAddressTranslation *B) override { BAT = B; }
76 
77   /// Check whether \p FileName is a perf.data file
78   static bool checkPerfDataMagic(StringRef FileName);
79 
80 private:
81   struct PerfBranchSample {
82     SmallVector<LBREntry, 32> LBR;
83   };
84 
85   struct PerfBasicSample {
86     StringRef EventName;
87     uint64_t PC;
88   };
89 
90   struct PerfMemSample {
91     uint64_t PC;
92     uint64_t Addr;
93   };
94 
95   /// Used for parsing specific pre-aggregated input files.
96   struct AggregatedLBREntry {
97     enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN };
98     Location From;
99     Location To;
100     uint64_t Count;
101     uint64_t Mispreds;
102     Type EntryType;
103   };
104 
105   struct Trace {
106     uint64_t From;
107     uint64_t To;
108     Trace(uint64_t From, uint64_t To) : From(From), To(To) {}
109     bool operator==(const Trace &Other) const {
110       return From == Other.From && To == Other.To;
111     }
112   };
113 
114   struct TraceHash {
115     size_t operator()(const Trace &L) const {
116       return std::hash<uint64_t>()(L.From << 32 | L.To);
117     }
118   };
119 
120   struct FTInfo {
121     uint64_t InternCount{0};
122     uint64_t ExternCount{0};
123   };
124 
125   struct TakenBranchInfo {
126     uint64_t TakenCount{0};
127     uint64_t MispredCount{0};
128   };
129 
130   /// Intermediate storage for profile data. We save the results of parsing
131   /// and use them later for processing and assigning profile.
132   std::unordered_map<Trace, TakenBranchInfo, TraceHash> BranchLBRs;
133   std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs;
134   std::vector<AggregatedLBREntry> AggregatedLBRs;
135   std::unordered_map<uint64_t, uint64_t> BasicSamples;
136   std::vector<PerfMemSample> MemSamples;
137 
138   template <typename T> void clear(T &Container) {
139     T TempContainer;
140     TempContainer.swap(Container);
141   }
142 
143   /// Perf utility full path name
144   std::string PerfPath;
145 
146   /// Perf process spawning bookkeeping
147   struct PerfProcessInfo {
148     bool IsFinished{false};
149     sys::ProcessInfo PI;
150     SmallVector<char, 256> StdoutPath;
151     SmallVector<char, 256> StderrPath;
152   };
153 
154   /// Process info for spawned processes
155   PerfProcessInfo MainEventsPPI;
156   PerfProcessInfo MemEventsPPI;
157   PerfProcessInfo MMapEventsPPI;
158   PerfProcessInfo TaskEventsPPI;
159 
160   /// Kernel VM starts at fixed based address
161   /// https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt
162   static constexpr uint64_t KernelBaseAddr = 0xffff800000000000;
163 
164   /// Current list of created temporary files
165   std::vector<std::string> TempFiles;
166 
167   /// Name of the binary with matching build-id from perf.data if different
168   /// from the file name in BC.
169   std::string BuildIDBinaryName;
170 
171   /// Memory map info for a single file as recorded in perf.data
172   /// When a binary has multiple text segments, the Size is computed as the
173   /// difference of the last address of these segments from the BaseAddress.
174   /// The base addresses of all text segments must be the same.
175   struct MMapInfo {
176     uint64_t BaseAddress{0}; /// Base address of the mapped binary.
177     uint64_t MMapAddress{0}; /// Address of the executable segment.
178     uint64_t Size{0};        /// Size of the mapping.
179     uint64_t Offset{0};      /// File offset of the mapped segment.
180     int32_t PID{-1};         /// Process ID.
181     bool Forked{false};      /// Was the process forked?
182     uint64_t Time{0ULL};     /// Time in micro seconds.
183   };
184 
185   /// Per-PID map info for the binary
186   std::unordered_map<uint64_t, MMapInfo> BinaryMMapInfo;
187 
188   /// Fork event info
189   struct ForkInfo {
190     int32_t ParentPID;
191     int32_t ChildPID;
192     uint64_t Time{0ULL};
193   };
194 
195   /// References to core BOLT data structures
196   BinaryContext *BC{nullptr};
197 
198   BoltAddressTranslation *BAT{nullptr};
199 
200   /// Update function execution profile with a recorded trace.
201   /// A trace is region of code executed between two LBR entries supplied in
202   /// execution order.
203   ///
204   /// Return a vector of offsets corresponding to a trace in a function
205   /// if the trace is valid, std::nullopt otherwise.
206   std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
207   getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First,
208                          const LBREntry &Second, uint64_t Count = 1) const;
209 
210   /// Record external entry into the function \p BF.
211   ///
212   /// Return true if the entry is valid, false otherwise.
213   bool recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred,
214                    uint64_t Count = 1) const;
215 
216   /// Record exit from the function \p BF via a call or return.
217   ///
218   /// Return true if the exit point is valid, false otherwise.
219   bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred,
220                   uint64_t Count = 1) const;
221 
222   /// Aggregation statistics
223   uint64_t NumInvalidTraces{0};
224   uint64_t NumLongRangeTraces{0};
225   /// Specifies how many samples were recorded in cold areas if we are dealing
226   /// with profiling data collected in a bolted binary. For LBRs, incremented
227   /// for the source of the branch to avoid counting cold activity twice (one
228   /// for source and another for destination).
229   uint64_t NumColdSamples{0};
230 
231   /// Looks into system PATH for Linux Perf and set up the aggregator to use it
232   void findPerfExecutable();
233 
234   /// Launch a perf subprocess with given args and save output for later
235   /// parsing.
236   void launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
237                          const char *ArgsString, bool Wait);
238 
239   /// Delete all temporary files created to hold the output generated by spawned
240   /// subprocesses during the aggregation job
241   void deleteTempFiles();
242 
243   // Semantic pass helpers
244 
245   /// Look up which function contains an address by using out map of
246   /// disassembled BinaryFunctions
247   BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const;
248 
249   /// Perform BAT translation for a given \p Func and return the parent
250   /// BinaryFunction or nullptr.
251   BinaryFunction *getBATParentFunction(const BinaryFunction &Func) const;
252 
253   /// Retrieve the location name to be used for samples recorded in \p Func.
254   static StringRef getLocationName(const BinaryFunction &Func, bool BAT);
255 
256   /// Semantic actions - parser hooks to interpret parsed perf samples
257   /// Register a sample (non-LBR mode), i.e. a new hit at \p Address
258   bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count);
259 
260   /// Register an intraprocedural branch \p Branch.
261   bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To,
262                      uint64_t Count, uint64_t Mispreds);
263 
264   /// Register an interprocedural branch from \p FromFunc to \p ToFunc with
265   /// offsets \p From and \p To, respectively.
266   bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc,
267                      uint64_t From, uint64_t To, uint64_t Count,
268                      uint64_t Mispreds);
269 
270   /// Register a \p Branch.
271   bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds,
272                 bool IsPreagg);
273 
274   /// Register a trace between two LBR entries supplied in execution order.
275   bool doTrace(const LBREntry &First, const LBREntry &Second,
276                uint64_t Count = 1);
277 
278   /// Parser helpers
279   /// Return false if we exhausted our parser buffer and finished parsing
280   /// everything
281   bool hasData() const { return !ParsingBuf.empty(); }
282 
283   /// Print heat map based on LBR samples.
284   std::error_code printLBRHeatMap();
285 
286   /// Parse a single perf sample containing a PID associated with a sequence of
287   /// LBR entries. If the PID does not correspond to the binary we are looking
288   /// for, return std::errc::no_such_process. If other parsing errors occur,
289   /// return the error. Otherwise, return the parsed sample.
290   ErrorOr<PerfBranchSample> parseBranchSample();
291 
292   /// Parse a single perf sample containing a PID associated with an event name
293   /// and a PC
294   ErrorOr<PerfBasicSample> parseBasicSample();
295 
296   /// Parse a single perf sample containing a PID associated with an IP and
297   /// address.
298   ErrorOr<PerfMemSample> parseMemSample();
299 
300   /// Parse pre-aggregated LBR samples created by an external tool
301   ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry();
302 
303   /// Parse either buildid:offset or just offset, representing a location in the
304   /// binary. Used exclusively for pre-aggregated LBR samples.
305   ErrorOr<Location> parseLocationOrOffset();
306 
307   /// Check if a field separator is the next char to parse and, if yes, consume
308   /// it and return true
309   bool checkAndConsumeFS();
310 
311   /// Consume the entire line
312   void consumeRestOfLine();
313 
314   /// True if the next token in the parsing buffer is a new line, but don't
315   /// consume it (peek only).
316   bool checkNewLine();
317 
318   using PerfProcessErrorCallbackTy = std::function<void(int, StringRef)>;
319   /// Prepare to parse data from a given perf script invocation.
320   /// Returns an invocation exit code.
321   int prepareToParse(StringRef Name, PerfProcessInfo &Process,
322                      PerfProcessErrorCallbackTy Callback);
323 
324   /// Parse a single LBR entry as output by perf script -Fbrstack
325   ErrorOr<LBREntry> parseLBREntry();
326 
327   /// Parse LBR sample, returns the number of traces.
328   uint64_t parseLBRSample(const PerfBranchSample &Sample, bool NeedsSkylakeFix);
329 
330   /// Parse and pre-aggregate branch events.
331   std::error_code parseBranchEvents();
332 
333   /// Process all branch events.
334   void processBranchEvents();
335 
336   /// Parse the full output generated by perf script to report non-LBR samples.
337   std::error_code parseBasicEvents();
338 
339   /// Process non-LBR events.
340   void processBasicEvents();
341 
342   /// Parse the full output generated by perf script to report memory events.
343   std::error_code parseMemEvents();
344 
345   /// Process parsed memory events profile.
346   void processMemEvents();
347 
348   /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping
349   /// between the binary name and its memory layout in a process with a given
350   /// PID.
351   /// On success return a <FileName, MMapInfo> pair.
352   ErrorOr<std::pair<StringRef, MMapInfo>> parseMMapEvent();
353 
354   /// Parse PERF_RECORD_FORK event.
355   std::optional<ForkInfo> parseForkEvent();
356 
357   /// Parse 'PERF_RECORD_COMM exec'. Don't consume the string.
358   std::optional<int32_t> parseCommExecEvent();
359 
360   /// Parse the full output generated by `perf script --show-mmap-events`
361   /// to generate mapping between binary files and their memory mappings for
362   /// all PIDs.
363   std::error_code parseMMapEvents();
364 
365   /// Parse output of `perf script --show-task-events`, and forked processes
366   /// to the set of tracked PIDs.
367   std::error_code parseTaskEvents();
368 
369   /// Parse a single pair of binary full path and associated build-id
370   std::optional<std::pair<StringRef, StringRef>> parseNameBuildIDPair();
371 
372   /// Coordinate reading and parsing of pre-aggregated file
373   ///
374   /// The regular perf2bolt aggregation job is to read perf output directly.
375   /// However, if the data is coming from a database instead of perf, one could
376   /// write a query to produce a pre-aggregated file. This function deals with
377   /// this case.
378   ///
379   /// The pre-aggregated file contains aggregated LBR data, but without binary
380   /// knowledge. BOLT will parse it and, using information from the disassembled
381   /// binary, augment it with fall-through edge frequency information. After
382   /// this step is finished, this data can be either written to disk to be
383   /// consumed by BOLT later, or can be used by BOLT immediately if kept in
384   /// memory.
385   ///
386   /// File format syntax:
387   /// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
388   ///       [<mispred_count>]
389   ///
390   /// B - indicates an aggregated branch
391   /// F - an aggregated fall-through
392   /// f - an aggregated fall-through with external origin - used to disambiguate
393   ///       between a return hitting a basic block head and a regular internal
394   ///       jump to the block
395   ///
396   /// <start_id> - build id of the object containing the start address. We can
397   /// skip it for the main binary and use "X" for an unknown object. This will
398   /// save some space and facilitate human parsing.
399   ///
400   /// <start_offset> - hex offset from the object base load address (0 for the
401   /// main executable unless it's PIE) to the start address.
402   ///
403   /// <end_id>, <end_offset> - same for the end address.
404   ///
405   /// <count> - total aggregated count of the branch or a fall-through.
406   ///
407   /// <mispred_count> - the number of times the branch was mispredicted.
408   /// Omitted for fall-throughs.
409   ///
410   /// Example:
411   /// F 41be50 41be50 3
412   /// F 41be90 41be90 4
413   /// B 4b1942 39b57f0 3 0
414   /// B 4b196f 4b19e0 2 0
415   void parsePreAggregated();
416 
417   /// Parse the full output of pre-aggregated LBR samples generated by
418   /// an external tool.
419   std::error_code parsePreAggregatedLBRSamples();
420 
421   /// Process parsed pre-aggregated data.
422   void processPreAggregated();
423 
424   /// If \p Address falls into the binary address space based on memory
425   /// mapping info \p MMI, then adjust it for further processing by subtracting
426   /// the base load address. External addresses, i.e. addresses that do not
427   /// correspond to the binary allocated address space, are adjusted to avoid
428   /// conflicts.
429   void adjustAddress(uint64_t &Address, const MMapInfo &MMI) const {
430     if (Address >= MMI.MMapAddress && Address < MMI.MMapAddress + MMI.Size) {
431       Address -= MMI.BaseAddress;
432     } else if (Address < MMI.Size) {
433       // Make sure the address is not treated as belonging to the binary.
434       Address = (-1ULL);
435     }
436   }
437 
438   /// Adjust addresses in \p LBR entry.
439   void adjustLBR(LBREntry &LBR, const MMapInfo &MMI) const {
440     adjustAddress(LBR.From, MMI);
441     adjustAddress(LBR.To, MMI);
442   }
443 
444   /// Ignore kernel/user transition LBR if requested
445   bool ignoreKernelInterrupt(LBREntry &LBR) const;
446 
447   /// Populate functions in \p BC with profile.
448   void processProfile(BinaryContext &BC);
449 
450   /// Start an aggregation job asynchronously.
451   void start();
452 
453   /// Returns true if this aggregation job is using a translation table to
454   /// remap samples collected on binaries already processed by BOLT.
455   bool usesBAT() const { return BAT; }
456 
457   /// Force all subprocesses to stop and cancel aggregation
458   void abort();
459 
460   /// Dump data structures into a file readable by llvm-bolt
461   std::error_code writeAggregatedFile(StringRef OutputFilename) const;
462 
463   /// Dump translated data structures into YAML
464   std::error_code writeBATYAML(BinaryContext &BC,
465                                StringRef OutputFilename) const;
466 
467   /// Filter out binaries based on PID
468   void filterBinaryMMapInfo();
469 
470   /// If we have a build-id available for the input file, use it to assist
471   /// matching profile to a binary.
472   ///
473   /// If the binary name changed after profile collection, use build-id
474   /// to get the proper name in perf data when build-ids are available.
475   /// If \p FileBuildID has no match, then issue an error and exit.
476   void processFileBuildID(StringRef FileBuildID);
477 
478   /// Debugging dump methods
479   void dump() const;
480   void dump(const LBREntry &LBR) const;
481   void dump(const PerfBranchSample &Sample) const;
482   void dump(const PerfMemSample &Sample) const;
483 
484 public:
485   /// If perf.data was collected without build ids, the buildid-list may contain
486   /// incomplete entries. Return true if the buffer containing
487   /// "perf buildid-list" output has only valid entries and is non- empty.
488   /// Return false otherwise.
489   bool hasAllBuildIDs();
490 
491   /// Parse the output generated by "perf buildid-list" to extract build-ids
492   /// and return a file name matching a given \p FileBuildID.
493   std::optional<StringRef> getFileNameForBuildID(StringRef FileBuildID);
494 
495   /// Get a constant reference to the parsed binary mmap entries.
496   const std::unordered_map<uint64_t, MMapInfo> &getBinaryMMapInfo() {
497     return BinaryMMapInfo;
498   }
499 
500   friend class YAMLProfileWriter;
501 };
502 } // namespace bolt
503 } // namespace llvm
504 
505 #endif
506