1 //===- bolt/runtime/instr.cpp ---------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // BOLT runtime instrumentation library for x86 Linux. Currently, BOLT does
10 // not support linking modules with dependencies on one another into the final
11 // binary (TODO?), which means this library has to be self-contained in a single
12 // module.
13 //
14 // All extern declarations here need to be defined by BOLT itself. Those will be
15 // undefined symbols that BOLT needs to resolve by emitting these symbols with
16 // MCStreamer. Currently, Passes/Instrumentation.cpp is the pass responsible
17 // for defining the symbols here and these two files have a tight coupling: one
18 // working statically when you run BOLT and another during program runtime when
19 // you run an instrumented binary. The main goal here is to output an fdata file
20 // (BOLT profile) with the instrumentation counters inserted by the static pass.
21 // Counters for indirect calls are an exception, as we can't know them
22 // statically. These counters are created and managed here. To allow this, we
23 // need a minimal framework for allocating memory dynamically. We provide this
24 // with the BumpPtrAllocator class (not LLVM's, but our own version of it).
25 //
26 // Since this code is intended to be inserted into any executable, we decided to
27 // make it standalone and do not depend on any external libraries (i.e. language
28 // support libraries, such as glibc or stdc++). To allow this, we provide a few
29 // light implementations of common OS-interacting functionality using direct
30 // syscall wrappers. Our simple allocator doesn't manage deallocations that
31 // fragment the memory space, so it's stack based. This is the minimal framework
32 // provided here to allow processing instrumented counters and writing fdata.
33 //
34 // In the C++ idiom used here, we never use or rely on constructors or
35 // destructors for global objects. That's because those need support from the
36 // linker in initialization/finalization code, and we want to keep our linker
37 // very simple. Similarly, we don't create any global objects that are zero
38 // initialized, since those would need to go to .bss, which our simple linker also
39 // doesn't support (TODO?).
40 //
41 //===----------------------------------------------------------------------===//
42 
43 #if defined (__x86_64__)
44 #include "common.h"
45 
46 // Enables very verbose logging to stderr, useful when debugging
47 //#define ENABLE_DEBUG
48 
49 #ifdef ENABLE_DEBUG
50 #define DEBUG(X)                                                               \
51   { X; }
52 #else
53 #define DEBUG(X)                                                               \
54   {}
55 #endif
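// Usage sketch (illustrative): DEBUG(report("setup\n")); expands to
// { report("setup\n"); } when ENABLE_DEBUG is defined and to {} otherwise, so
// debug-only reporting compiles away in regular builds.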
56 
57 #pragma GCC visibility push(hidden)
58 
59 extern "C" {
60 
61 #if defined(__APPLE__)
62 extern uint64_t* _bolt_instr_locations_getter();
63 extern uint32_t _bolt_num_counters_getter();
64 
65 extern uint8_t* _bolt_instr_tables_getter();
66 extern uint32_t _bolt_instr_num_funcs_getter();
67 
68 #else
69 
70 // Main counters inserted by instrumentation, incremented during runtime when
71 // points of interest (locations) in the program are reached. Those are direct
72 // calls and direct and indirect branches (local ones). There are also counters
73 // for basic block execution if they are a spanning tree leaf and need to be
74 // counted in order to infer the execution count of other edges of the CFG.
75 extern uint64_t __bolt_instr_locations[];
76 extern uint32_t __bolt_num_counters;
77 // Descriptions are serialized metadata about binary functions written by BOLT,
78 // so we have a minimal understanding of the program structure. For a
79 // reference on the exact format of this metadata, see *Description structs,
80 // Location, InstrumentedNode and EntryNode.
81 // Number of indirect call site descriptions
82 extern uint32_t __bolt_instr_num_ind_calls;
83 // Number of indirect call target descriptions
84 extern uint32_t __bolt_instr_num_ind_targets;
85 // Number of function descriptions
86 extern uint32_t __bolt_instr_num_funcs;
87 // Time to sleep across dumps (when we write the fdata profile to disk)
88 extern uint32_t __bolt_instr_sleep_time;
89 // Do not clear counters across dumps, rewrite file with the updated values
90 extern bool __bolt_instr_no_counters_clear;
91 // Wait until all forks of the instrumented process finish
92 extern bool __bolt_instr_wait_forks;
93 // Filename to dump data to
94 extern char __bolt_instr_filename[];
95 // Instrumented binary file path
96 extern char __bolt_instr_binpath[];
97 // If true, append current PID to the fdata filename when creating it so
98 // different invocations of the same program can be differentiated.
99 extern bool __bolt_instr_use_pid;
100 // Functions that will be used to instrument indirect calls. BOLT static pass
101 // will identify indirect calls and modify them to load the address in these
102 // trampolines and call this address instead. BOLT can't use direct calls to
103 // our handlers because our addresses here are not known at analysis time. We
104 // only support resolving dependencies from this file to the output of BOLT,
105 // *not* the other way around.
106 // TODO: We need better linking support to make that happen.
107 extern void (*__bolt_ind_call_counter_func_pointer)();
108 extern void (*__bolt_ind_tailcall_counter_func_pointer)();
109 // Function pointers to init/fini trampoline routines in the binary, so we can
110 // resume regular execution of these functions that we hooked
111 extern void __bolt_start_trampoline();
112 extern void __bolt_fini_trampoline();
113 
114 #endif
115 }
116 
117 namespace {
118 
119 /// A simple allocator that mmaps a fixed size region and manages this space
120 /// in a stack fashion, meaning you always deallocate the last element that
121 /// was allocated. In practice, we don't need to deallocate individual elements.
122 /// We monotonically increase our usage and then deallocate everything once we
123 /// are done processing something.
124 class BumpPtrAllocator {
125   /// This is written before each allocation and acts as a canary to detect when
126   /// a bug caused our program to cross allocation boundaries.
127   struct EntryMetadata {
128     uint64_t Magic;
129     uint64_t AllocSize;
130   };
131 
132 public:
133   void *allocate(size_t Size) {
134     Lock L(M);
135 
136     if (StackBase == nullptr) {
137       StackBase = reinterpret_cast<uint8_t *>(
138           __mmap(0, MaxSize, PROT_READ | PROT_WRITE,
139                  (Shared ? MAP_SHARED : MAP_PRIVATE) | MAP_ANONYMOUS, -1, 0));
140       assert(StackBase != MAP_FAILED,
141              "BumpPtrAllocator: failed to mmap stack!");
142       StackSize = 0;
143     }
144 
145     Size = alignTo(Size + sizeof(EntryMetadata), 16);
146     uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
147     auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
148     M->Magic = Magic;
149     M->AllocSize = Size;
150     StackSize += Size;
151     assert(StackSize < MaxSize, "allocator ran out of memory");
152     return AllocAddress;
153   }
154 
155 #ifdef DEBUG
156   /// Element-wise deallocation is only used for debugging to catch memory
157   /// bugs by checking magic bytes. Ordinarily, we reset the allocator once
158   /// we are done with it. Reset is done with clear(). There's no need
159   /// to deallocate each element individually.
160   void deallocate(void *Ptr) {
161     Lock L(M);
162     uint8_t MetadataOffset = sizeof(EntryMetadata);
163     auto *M = reinterpret_cast<EntryMetadata *>(
164         reinterpret_cast<uint8_t *>(Ptr) - MetadataOffset);
165     const uint8_t *StackTop = StackBase + StackSize + MetadataOffset;
166     // Validate size
167     if (Ptr != StackTop - M->AllocSize) {
168       // Failed validation, check if it is a pointer returned by operator new []
169       MetadataOffset +=
170           sizeof(uint64_t); // Space for number of elements alloc'ed
171       M = reinterpret_cast<EntryMetadata *>(reinterpret_cast<uint8_t *>(Ptr) -
172                                             MetadataOffset);
173       // Ok, it failed both checks if this assertion fails. Stop the program, we
174       // have a memory bug.
175       assert(Ptr == StackTop - M->AllocSize,
176              "must deallocate the last element alloc'ed");
177     }
178     assert(M->Magic == Magic, "allocator magic is corrupt");
179     StackSize -= M->AllocSize;
180   }
181 #else
182   void deallocate(void *) {}
183 #endif
184 
185   void clear() {
186     Lock L(M);
187     StackSize = 0;
188   }
189 
190   /// Set mmap reservation size (only relevant before first allocation)
191   void setMaxSize(uint64_t Size) { MaxSize = Size; }
192 
193   /// Set mmap reservation privacy (only relevant before first allocation)
194   void setShared(bool S) { Shared = S; }
195 
196   void destroy() {
197     if (StackBase == nullptr)
198       return;
199     __munmap(StackBase, MaxSize);
200   }
201 
202 private:
203   static constexpr uint64_t Magic = 0x1122334455667788ull;
204   uint64_t MaxSize = 0xa00000;
205   uint8_t *StackBase{nullptr};
206   uint64_t StackSize{0};
207   bool Shared{false};
208   Mutex M;
209 };
210 
211 /// Used for allocating indirect call instrumentation counters. Initialized by
212 /// __bolt_instr_setup, our initialization routine.
213 BumpPtrAllocator GlobalAlloc;
214 } // anonymous namespace
215 
216 // User-defined placement new operators. We only use those (as opposed to
217 // overriding the regular operator new) so we can keep our allocator on the
218 // stack instead of in a data section (global).
219 void *operator new(size_t Sz, BumpPtrAllocator &A) { return A.allocate(Sz); }
220 void *operator new(size_t Sz, BumpPtrAllocator &A, char C) {
221   auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
222   memset(Ptr, C, Sz);
223   return Ptr;
224 }
225 void *operator new[](size_t Sz, BumpPtrAllocator &A) {
226   return A.allocate(Sz);
227 }
228 void *operator new[](size_t Sz, BumpPtrAllocator &A, char C) {
229   auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
230   memset(Ptr, C, Sz);
231   return Ptr;
232 }
233 // Only called during exception unwinding (useless). We must manually dealloc.
234 // C++ language weirdness
235 void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); }
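
// Usage sketch: with a BumpPtrAllocator instance Alloc, code in this file
// allocates through the placement forms above, e.g.
//   MapEntry *Table = new (Alloc, 0) MapEntry[InitialSize]; // zero-filled
//   Graph *G = new (Alloc) Graph(Alloc, F, Counters, Ctx);  // plain storage
// and releases everything at once via Alloc.clear() (or Alloc.destroy()),
// following the stack discipline of BumpPtrAllocator.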
236 
237 namespace {
238 
239 // Disable instrumentation optimizations that sacrifice profile accuracy
240 extern "C" bool __bolt_instr_conservative;
241 
242 /// Basic key-val atom stored in our hash
243 struct SimpleHashTableEntryBase {
244   uint64_t Key;
245   uint64_t Val;
246   void dump(const char *Msg = nullptr) {
247     // TODO: make some sort of formatting function
248     // Currently we have to do it the ugly way because
249     // we want every message to be printed atomically via a single call to
250     // __write. If we use reportNumber() and others multiple times, we'll get
251     // garbage in a multithreaded environment
252     char Buf[BufSize];
253     char *Ptr = Buf;
254     Ptr = intToStr(Ptr, __getpid(), 10);
255     *Ptr++ = ':';
256     *Ptr++ = ' ';
257     if (Msg)
258       Ptr = strCopy(Ptr, Msg, strLen(Msg));
259     *Ptr++ = '0';
260     *Ptr++ = 'x';
261     Ptr = intToStr(Ptr, (uint64_t)this, 16);
262     *Ptr++ = ':';
263     *Ptr++ = ' ';
264     Ptr = strCopy(Ptr, "MapEntry(0x", sizeof("MapEntry(0x") - 1);
265     Ptr = intToStr(Ptr, Key, 16);
266     *Ptr++ = ',';
267     *Ptr++ = ' ';
268     *Ptr++ = '0';
269     *Ptr++ = 'x';
270     Ptr = intToStr(Ptr, Val, 16);
271     *Ptr++ = ')';
272     *Ptr++ = '\n';
273     assert(Ptr - Buf < BufSize, "Buffer overflow!");
274     // print everything all at once for atomicity
275     __write(2, Buf, Ptr - Buf);
276   }
277 };
278 
279 /// This hash table implementation starts by allocating a table of size
280 /// InitialSize. When conflicts happen in this main table, it resolves
281 /// them by chaining a new table of size IncSize. It never reallocs as our
282 /// allocator doesn't support it. The key is intended to be function pointers.
283 /// There's no clever hash function (it's just x mod size, size being prime).
284 /// I never tuned the coefficientes in the modular equation (TODO)
285 /// This is used for indirect calls (each call site has one of this, so it
286 /// should have a small footprint) and for tallying call counts globally for
287 /// each target to check if we missed the origin of some calls (this one is a
288 /// large instantiation of this template, since it is global for all call sites)
289 template <typename T = SimpleHashTableEntryBase, uint32_t InitialSize = 7,
290           uint32_t IncSize = 7>
291 class SimpleHashTable {
292 public:
293   using MapEntry = T;
294 
295   /// Increment by 1 the value of \p Key. If it is not in this table, it will be
296   /// added to the table and its value set to 1.
297   void incrementVal(uint64_t Key, BumpPtrAllocator &Alloc) {
298     ++get(Key, Alloc).Val;
299   }
300 
301   /// Basic member accessing interface. Here we pass the allocator explicitly to
302   /// avoid storing a pointer to it as part of this table (remember there is one
303   /// hash for each indirect call site, so we want to minimize our footprint).
304   MapEntry &get(uint64_t Key, BumpPtrAllocator &Alloc) {
305     if (!__bolt_instr_conservative) {
306       TryLock L(M);
307       if (!L.isLocked())
308         return NoEntry;
309       return getOrAllocEntry(Key, Alloc);
310     }
311     Lock L(M);
312     return getOrAllocEntry(Key, Alloc);
313   }
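
  // Note: in the default (non-conservative) mode above, an update that fails
  // to acquire the lock is redirected to NoEntry and effectively dropped,
  // trading a little accuracy for never stalling the instrumented thread.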
314 
315   /// Traverses all elements in the table
316   template <typename... Args>
317   void forEachElement(void (*Callback)(MapEntry &, Args...), Args... args) {
318     Lock L(M);
319     if (!TableRoot)
320       return;
321     return forEachElement(Callback, InitialSize, TableRoot, args...);
322   }
323 
324   void resetCounters();
325 
326 private:
327   constexpr static uint64_t VacantMarker = 0;
328   constexpr static uint64_t FollowUpTableMarker = 0x8000000000000000ull;
329 
330   MapEntry *TableRoot{nullptr};
331   MapEntry NoEntry;
332   Mutex M;
333 
334   template <typename... Args>
335   void forEachElement(void (*Callback)(MapEntry &, Args...),
336                       uint32_t NumEntries, MapEntry *Entries, Args... args) {
337     for (uint32_t I = 0; I < NumEntries; ++I) {
338       MapEntry &Entry = Entries[I];
339       if (Entry.Key == VacantMarker)
340         continue;
341       if (Entry.Key & FollowUpTableMarker) {
342         forEachElement(Callback, IncSize,
343                        reinterpret_cast<MapEntry *>(Entry.Key &
344                                                     ~FollowUpTableMarker),
345                        args...);
346         continue;
347       }
348       Callback(Entry, args...);
349     }
350   }
351 
352   MapEntry &firstAllocation(uint64_t Key, BumpPtrAllocator &Alloc) {
353     TableRoot = new (Alloc, 0) MapEntry[InitialSize];
354     MapEntry &Entry = TableRoot[Key % InitialSize];
355     Entry.Key = Key;
356     // DEBUG(Entry.dump("Created root entry: "));
357     return Entry;
358   }
359 
360   MapEntry &getEntry(MapEntry *Entries, uint64_t Key, uint64_t Selector,
361                      BumpPtrAllocator &Alloc, int CurLevel) {
362     // DEBUG(reportNumber("getEntry called, level ", CurLevel, 10));
363     const uint32_t NumEntries = CurLevel == 0 ? InitialSize : IncSize;
364     uint64_t Remainder = Selector / NumEntries;
365     Selector = Selector % NumEntries;
366     MapEntry &Entry = Entries[Selector];
367 
368     // A hit
369     if (Entry.Key == Key) {
370       // DEBUG(Entry.dump("Hit: "));
371       return Entry;
372     }
373 
374     // Vacant - add new entry
375     if (Entry.Key == VacantMarker) {
376       Entry.Key = Key;
377       // DEBUG(Entry.dump("Adding new entry: "));
378       return Entry;
379     }
380 
381     // Defer to the next level
382     if (Entry.Key & FollowUpTableMarker) {
383       return getEntry(
384           reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker),
385           Key, Remainder, Alloc, CurLevel + 1);
386     }
387 
388     // Conflict - create the next level
389     // DEBUG(Entry.dump("Creating new level: "));
390 
391     MapEntry *NextLevelTbl = new (Alloc, 0) MapEntry[IncSize];
392     // DEBUG(
393     //     reportNumber("Newly allocated level: 0x", uint64_t(NextLevelTbl),
394     //     16));
395     uint64_t CurEntrySelector = Entry.Key / InitialSize;
396     for (int I = 0; I < CurLevel; ++I)
397       CurEntrySelector /= IncSize;
398     CurEntrySelector = CurEntrySelector % IncSize;
399     NextLevelTbl[CurEntrySelector] = Entry;
400     Entry.Key = reinterpret_cast<uint64_t>(NextLevelTbl) | FollowUpTableMarker;
401     assert((NextLevelTbl[CurEntrySelector].Key & ~FollowUpTableMarker) !=
402                uint64_t(Entries),
403            "circular reference created!\n");
404     // DEBUG(NextLevelTbl[CurEntrySelector].dump("New level entry: "));
405     // DEBUG(Entry.dump("Updated old entry: "));
406     return getEntry(NextLevelTbl, Key, Remainder, Alloc, CurLevel + 1);
407   }
408 
409   MapEntry &getOrAllocEntry(uint64_t Key, BumpPtrAllocator &Alloc) {
410     if (TableRoot)
411       return getEntry(TableRoot, Key, Key, Alloc, 0);
412     return firstAllocation(Key, Alloc);
413   }
414 };
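
// Worked example (illustrative), with the default InitialSize = IncSize = 7:
// inserting Key = 10 occupies root slot 10 % 7 == 3. Inserting Key = 24 also
// selects slot 3, so a follow-up table is chained there: the existing entry
// moves to slot (10 / 7) % 7 == 1 of the new table, Key = 24 lands in slot
// (24 / 7) % 7 == 3 of it, and the root slot now stores the follow-up table
// pointer tagged with FollowUpTableMarker.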
415 
416 template <typename T> void resetIndCallCounter(T &Entry) {
417   Entry.Val = 0;
418 }
419 
420 template <typename T, uint32_t X, uint32_t Y>
421 void SimpleHashTable<T, X, Y>::resetCounters() {
422   forEachElement(resetIndCallCounter);
423 }
424 
425 /// Represents a hash table mapping a function target address to its counter.
426 using IndirectCallHashTable = SimpleHashTable<>;
427 
428 /// Initialize with number 1 instead of 0 so we don't go into .bss. This is the
429 /// global array of all hash tables storing indirect call destinations happening
430 /// during runtime, one table per call site.
431 IndirectCallHashTable *GlobalIndCallCounters{
432     reinterpret_cast<IndirectCallHashTable *>(1)};
433 
434 /// Don't allow reentrancy in the fdata writing phase - only one thread writes
435 /// it
436 Mutex *GlobalWriteProfileMutex{reinterpret_cast<Mutex *>(1)};
437 
438 /// Store number of calls in addition to target address (Key) and frequency
439 /// as perceived by the basic block counter (Val).
440 struct CallFlowEntryBase : public SimpleHashTableEntryBase {
441   uint64_t Calls;
442 };
443 
444 using CallFlowHashTableBase = SimpleHashTable<CallFlowEntryBase, 11939, 233>;
445 
446 /// This is a large table indexing all possible call targets (indirect and
447 /// direct ones). The goal is to find mismatches between number of calls (for
448 /// those calls we were able to track) and the entry basic block counter of the
449 /// callee. In most cases, these two should be equal. If not, there are two
450 /// possible scenarios here:
451 ///
452 ///  * Entry BB has higher frequency than all known calls to this function.
453 ///    In this case, we have dynamic library code or any uninstrumented code
454 ///    calling this function. We will write the profile for these untracked
455 ///    calls as having source "0 [unknown] 0" in the fdata file.
456 ///
457 ///  * Number of known calls is higher than the frequency of entry BB
458 ///    This only happens when there is no counter for the entry BB or the callee
459 ///    function is not simple (in BOLT terms). We don't do anything special
460 ///    here and just ignore those (we still report all calls to the non-simple
461 ///    function, though).
462 ///
463 class CallFlowHashTable : public CallFlowHashTableBase {
464 public:
465   CallFlowHashTable(BumpPtrAllocator &Alloc) : Alloc(Alloc) {}
466 
467   MapEntry &get(uint64_t Key) { return CallFlowHashTableBase::get(Key, Alloc); }
468 
469 private:
470   // Unlike the hash table for indirect call targets, we do store the
471   // allocator here since there is only one call flow hash and space overhead
472   // is negligible.
473   BumpPtrAllocator &Alloc;
474 };
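
// Illustrative example: if a callee's entry basic block counter reports 100
// executions but the calls we tracked to it only add up to 60, the remaining
// 40 are later written (see visitCallFlowEntry) with source "0 [unknown] 0".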
475 
476 ///
477 /// Description metadata emitted by BOLT to describe the program - refer to
478 /// Passes/Instrumentation.cpp - Instrumentation::emitTablesAsELFNote()
479 ///
480 struct Location {
481   uint32_t FunctionName;
482   uint32_t Offset;
483 };
484 
485 struct CallDescription {
486   Location From;
487   uint32_t FromNode;
488   Location To;
489   uint32_t Counter;
490   uint64_t TargetAddress;
491 };
492 
493 using IndCallDescription = Location;
494 
495 struct IndCallTargetDescription {
496   Location Loc;
497   uint64_t Address;
498 };
499 
500 struct EdgeDescription {
501   Location From;
502   uint32_t FromNode;
503   Location To;
504   uint32_t ToNode;
505   uint32_t Counter;
506 };
507 
508 struct InstrumentedNode {
509   uint32_t Node;
510   uint32_t Counter;
511 };
512 
513 struct EntryNode {
514   uint64_t Node;
515   uint64_t Address;
516 };
517 
518 struct FunctionDescription {
519   uint32_t NumLeafNodes;
520   const InstrumentedNode *LeafNodes;
521   uint32_t NumEdges;
522   const EdgeDescription *Edges;
523   uint32_t NumCalls;
524   const CallDescription *Calls;
525   uint32_t NumEntryNodes;
526   const EntryNode *EntryNodes;
527 
528   /// Constructor will parse the serialized function metadata written by BOLT
529   FunctionDescription(const uint8_t *FuncDesc);
530 
531   uint64_t getSize() const {
532     return 16 + NumLeafNodes * sizeof(InstrumentedNode) +
533            NumEdges * sizeof(EdgeDescription) +
534            NumCalls * sizeof(CallDescription) +
535            NumEntryNodes * sizeof(EntryNode);
536   }
537 };
538 
539 /// The context is created when the fdata profile needs to be written to disk
540 /// and we need to interpret our runtime counters. It contains pointers to the
541 /// mmaped binary (only the BOLT written metadata section). Deserialization
542 /// should be straightforward as most data is POD or an array of POD elements.
543 /// This metadata is used to reconstruct function CFGs.
544 struct ProfileWriterContext {
545   IndCallDescription *IndCallDescriptions;
546   IndCallTargetDescription *IndCallTargets;
547   uint8_t *FuncDescriptions;
548   char *Strings;  // String table with function names used in this binary
549   int FileDesc;   // File descriptor for the file on disk backing this
550                   // information in memory via mmap
551   void *MMapPtr;  // The mmap ptr
552   int MMapSize;   // The mmap size
553 
554   /// Hash table storing all possible call destinations to detect untracked
555   /// calls and correctly report them as [unknown] in output fdata.
556   CallFlowHashTable *CallFlowTable;
557 
558   /// Lookup the sorted indirect call target vector to fetch function name and
559   /// offset for an arbitrary function pointer.
560   const IndCallTargetDescription *lookupIndCallTarget(uint64_t Target) const;
561 };
562 
563 /// Performs a string comparison and returns zero if Str1 matches Str2. Compares
564 /// at most Size characters.
565 int compareStr(const char *Str1, const char *Str2, int Size) {
566   while (*Str1 == *Str2) {
567     if (*Str1 == '\0' || --Size == 0)
568       return 0;
569     ++Str1;
570     ++Str2;
571   }
572   return 1;
573 }
574 
575 /// Output a Location to the fdata file
576 char *serializeLoc(const ProfileWriterContext &Ctx, char *OutBuf,
577                    const Location Loc, uint32_t BufSize) {
578   // fdata location format: Type Name Offset
579   // Type 1 - regular symbol
580   OutBuf = strCopy(OutBuf, "1 ");
581   const char *Str = Ctx.Strings + Loc.FunctionName;
582   uint32_t Size = 25;
583   while (*Str) {
584     *OutBuf++ = *Str++;
585     if (++Size >= BufSize)
586       break;
587   }
588   assert(!*Str, "buffer overflow, function name too large");
589   *OutBuf++ = ' ';
590   OutBuf = intToStr(OutBuf, Loc.Offset, 16);
591   *OutBuf++ = ' ';
592   return OutBuf;
593 }
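
// Example (hypothetical name): a Location resolving to function "frobnicate"
// at offset 0x1a is serialized as "1 frobnicate 1a " - symbol type, name and
// hex offset - so a full fdata edge line written later reads e.g.
// "1 frobnicate 1a 1 frobnicate 2f 0 100" for an edge executed 100 times.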
594 
595 /// Read and deserialize a function description written by BOLT. \p FuncDesc
596 /// points at the beginning of the function metadata structure in the file.
597 /// See Instrumentation::emitTablesAsELFNote()
598 FunctionDescription::FunctionDescription(const uint8_t *FuncDesc) {
599   NumLeafNodes = *reinterpret_cast<const uint32_t *>(FuncDesc);
600   DEBUG(reportNumber("NumLeafNodes = ", NumLeafNodes, 10));
601   LeafNodes = reinterpret_cast<const InstrumentedNode *>(FuncDesc + 4);
602 
603   NumEdges = *reinterpret_cast<const uint32_t *>(
604       FuncDesc + 4 + NumLeafNodes * sizeof(InstrumentedNode));
605   DEBUG(reportNumber("NumEdges = ", NumEdges, 10));
606   Edges = reinterpret_cast<const EdgeDescription *>(
607       FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode));
608 
609   NumCalls = *reinterpret_cast<const uint32_t *>(
610       FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode) +
611       NumEdges * sizeof(EdgeDescription));
612   DEBUG(reportNumber("NumCalls = ", NumCalls, 10));
613   Calls = reinterpret_cast<const CallDescription *>(
614       FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) +
615       NumEdges * sizeof(EdgeDescription));
616   NumEntryNodes = *reinterpret_cast<const uint32_t *>(
617       FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) +
618       NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription));
619   DEBUG(reportNumber("NumEntryNodes = ", NumEntryNodes, 10));
620   EntryNodes = reinterpret_cast<const EntryNode *>(
621       FuncDesc + 16 + NumLeafNodes * sizeof(InstrumentedNode) +
622       NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription));
623 }
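
// The serialized layout parsed above is therefore, in order:
//   [NumLeafNodes:4] [LeafNodes:  NumLeafNodes  * sizeof(InstrumentedNode)]
//   [NumEdges:4]     [Edges:      NumEdges      * sizeof(EdgeDescription)]
//   [NumCalls:4]     [Calls:      NumCalls      * sizeof(CallDescription)]
//   [NumEntryNodes:4][EntryNodes: NumEntryNodes * sizeof(EntryNode)]
// which is why FunctionDescription::getSize() adds 16 bytes of counts to the
// four array sizes.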
624 
625 /// Read and mmap descriptions written by BOLT from the executable's notes
626 /// section
627 #if defined(HAVE_ELF_H) and !defined(__APPLE__)
628 
629 void *__attribute__((noinline)) __get_pc() {
630   return __builtin_extract_return_addr(__builtin_return_address(0));
631 }
632 
633 /// Parse a string with an address range into hex pair <StartAddress, EndAddress>
634 bool parseAddressRange(const char *Str, uint64_t &StartAddress,
635                        uint64_t &EndAddress) {
636   if (!Str)
637     return false;
638   // Parsed string format: <hex1>-<hex2>
639   StartAddress = hexToLong(Str, '-');
640   while (*Str && *Str != '-')
641     ++Str;
642   if (!*Str)
643     return false;
644   ++Str; // swallow '-'
645   EndAddress = hexToLong(Str);
646   return true;
647 }
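
// Example: a /proc/self/map_files entry named "55e5b8c00000-55e5b8c45000"
// parses to StartAddress == 0x55e5b8c00000 and EndAddress == 0x55e5b8c45000.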
648 
649 /// Get the full path to the real binary by taking the current virtual address
650 /// and searching /proc/self/map_files for the link whose address range
651 /// contains it
652 static char *getBinaryPath() {
653   const uint32_t BufSize = 1024;
654   const uint32_t NameMax = 4096;
655   const char DirPath[] = "/proc/self/map_files/";
656   static char TargetPath[NameMax] = {};
657   char Buf[BufSize];
658 
659   if (__bolt_instr_binpath[0] != '\0')
660     return __bolt_instr_binpath;
661 
662   if (TargetPath[0] != '\0')
663     return TargetPath;
664 
665   unsigned long CurAddr = (unsigned long)__get_pc();
666   uint64_t FDdir = __open(DirPath,
667                           /*flags=*/0 /*O_RDONLY*/,
668                           /*mode=*/0666);
669   assert(static_cast<int64_t>(FDdir) >= 0,
670          "failed to open /proc/self/map_files");
671 
672   while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) {
673     assert(static_cast<int64_t>(Nread) != -1, "failed to get folder entries");
674 
675     struct dirent *d;
676     for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) {
677       d = (struct dirent *)(Buf + Bpos);
678 
679       uint64_t StartAddress, EndAddress;
680       if (!parseAddressRange(d->d_name, StartAddress, EndAddress))
681         continue;
682       if (CurAddr < StartAddress || CurAddr > EndAddress)
683         continue;
684       char FindBuf[NameMax];
685       char *C = strCopy(FindBuf, DirPath, NameMax);
686       C = strCopy(C, d->d_name, NameMax - (C - FindBuf));
687       *C = '\0';
688       uint32_t Ret = __readlink(FindBuf, TargetPath, sizeof(TargetPath));
689       assert(Ret != -1 && Ret != BufSize, "readlink error");
690       TargetPath[Ret] = '\0';
691       return TargetPath;
692     }
693   }
694   return nullptr;
695 }
696 
697 ProfileWriterContext readDescriptions() {
698   ProfileWriterContext Result;
699   char *BinPath = getBinaryPath();
700   assert(BinPath && BinPath[0] != '\0', "failed to find binary path");
701 
702   uint64_t FD = __open(BinPath,
703                        /*flags=*/0 /*O_RDONLY*/,
704                        /*mode=*/0666);
705   assert(static_cast<int64_t>(FD) >= 0, "failed to open binary path");
706 
707   Result.FileDesc = FD;
708 
709   // mmap our binary to memory
710   uint64_t Size = __lseek(FD, 0, 2 /*SEEK_END*/);
711   uint8_t *BinContents = reinterpret_cast<uint8_t *>(
712       __mmap(0, Size, PROT_READ, MAP_PRIVATE, FD, 0));
713   assert(BinContents != MAP_FAILED, "readDescriptions: Failed to mmap self!");
714   Result.MMapPtr = BinContents;
715   Result.MMapSize = Size;
716   Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
717   Elf64_Shdr *Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff);
718   Elf64_Shdr *StringTblHeader = reinterpret_cast<Elf64_Shdr *>(
719       BinContents + Hdr->e_shoff + Hdr->e_shstrndx * Hdr->e_shentsize);
720 
721   // Find .bolt.instr.tables with the data we need and set pointers to it
722   for (int I = 0; I < Hdr->e_shnum; ++I) {
723     char *SecName = reinterpret_cast<char *>(
724         BinContents + StringTblHeader->sh_offset + Shdr->sh_name);
725     if (compareStr(SecName, ".bolt.instr.tables", 64) != 0) {
726       Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff +
727                                             (I + 1) * Hdr->e_shentsize);
728       continue;
729     }
730     // Actual contents of the ELF note start after offset 20 decimal:
731     // Offset 0: Producer name size (4 bytes)
732     // Offset 4: Contents size (4 bytes)
733     // Offset 8: Note type (4 bytes)
734     // Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
735     // Offset 20: Contents
736     uint32_t IndCallDescSize =
737         *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 20);
738     uint32_t IndCallTargetDescSize = *reinterpret_cast<uint32_t *>(
739         BinContents + Shdr->sh_offset + 24 + IndCallDescSize);
740     uint32_t FuncDescSize =
741         *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 28 +
742                                       IndCallDescSize + IndCallTargetDescSize);
743     Result.IndCallDescriptions = reinterpret_cast<IndCallDescription *>(
744         BinContents + Shdr->sh_offset + 24);
745     Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
746         BinContents + Shdr->sh_offset + 28 + IndCallDescSize);
747     Result.FuncDescriptions = BinContents + Shdr->sh_offset + 32 +
748                               IndCallDescSize + IndCallTargetDescSize;
749     Result.Strings = reinterpret_cast<char *>(
750         BinContents + Shdr->sh_offset + 32 + IndCallDescSize +
751         IndCallTargetDescSize + FuncDescSize);
752     return Result;
753   }
754   const char ErrMsg[] =
755       "BOLT instrumentation runtime error: could not find section "
756       ".bolt.instr.tables\n";
757   reportError(ErrMsg, sizeof(ErrMsg));
758   return Result;
759 }
760 
761 #else
762 
763 ProfileWriterContext readDescriptions() {
764   ProfileWriterContext Result;
765   uint8_t *Tables = _bolt_instr_tables_getter();
766   uint32_t IndCallDescSize = *reinterpret_cast<uint32_t *>(Tables);
767   uint32_t IndCallTargetDescSize =
768       *reinterpret_cast<uint32_t *>(Tables + 4 + IndCallDescSize);
769   uint32_t FuncDescSize = *reinterpret_cast<uint32_t *>(
770       Tables + 8 + IndCallDescSize + IndCallTargetDescSize);
771   Result.IndCallDescriptions =
772       reinterpret_cast<IndCallDescription *>(Tables + 4);
773   Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
774       Tables + 8 + IndCallDescSize);
775   Result.FuncDescriptions =
776       Tables + 12 + IndCallDescSize + IndCallTargetDescSize;
777   Result.Strings = reinterpret_cast<char *>(
778       Tables + 12 + IndCallDescSize + IndCallTargetDescSize + FuncDescSize);
779   return Result;
780 }
781 
782 #endif
783 
784 #if !defined(__APPLE__)
785 /// Debug aid: print overall metadata global numbers to check they are sane
786 void printStats(const ProfileWriterContext &Ctx) {
787   char StatMsg[BufSize];
788   char *StatPtr = StatMsg;
789   StatPtr =
790       strCopy(StatPtr,
791               "\nBOLT INSTRUMENTATION RUNTIME STATISTICS\n\nIndCallDescSize: ");
792   StatPtr = intToStr(StatPtr,
793                      Ctx.FuncDescriptions -
794                          reinterpret_cast<uint8_t *>(Ctx.IndCallDescriptions),
795                      10);
796   StatPtr = strCopy(StatPtr, "\nFuncDescSize: ");
797   StatPtr = intToStr(
798       StatPtr,
799       reinterpret_cast<uint8_t *>(Ctx.Strings) - Ctx.FuncDescriptions, 10);
800   StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_ind_calls: ");
801   StatPtr = intToStr(StatPtr, __bolt_instr_num_ind_calls, 10);
802   StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_funcs: ");
803   StatPtr = intToStr(StatPtr, __bolt_instr_num_funcs, 10);
804   StatPtr = strCopy(StatPtr, "\n");
805   __write(2, StatMsg, StatPtr - StatMsg);
806 }
807 #endif
808 
809 
810 /// This is part of a simple CFG representation in memory, where we store
811 /// a dynamically sized array of input and output edges per node, and store
812 /// a dynamically sized array of nodes per graph. We also store the spanning
813 /// tree edges for that CFG in a separate array of nodes in
814 /// \p SpanningTreeNodes, while the regular nodes live in \p CFGNodes.
815 struct Edge {
816   uint32_t Node; // Index into the nodes array of this edge's destination
817   uint32_t ID;   // Edge index in an array comprising all edges of the graph
818 };
819 
820 /// A regular graph node or a spanning tree node
821 struct Node {
822   uint32_t NumInEdges{0};  // Input edge count used to size InEdge
823   uint32_t NumOutEdges{0}; // Output edge count used to size OutEdges
824   Edge *InEdges{nullptr};  // Created and managed by \p Graph
825   Edge *OutEdges{nullptr}; // ditto
826 };
827 
828 /// Main class for CFG representation in memory. Manages object creation and
829 /// destruction, populates an array of CFG nodes as well as corresponding
830 /// spanning tree nodes.
831 struct Graph {
832   uint32_t NumNodes;
833   Node *CFGNodes;
834   Node *SpanningTreeNodes;
835   uint64_t *EdgeFreqs;
836   uint64_t *CallFreqs;
837   BumpPtrAllocator &Alloc;
838   const FunctionDescription &D;
839 
840   /// Reads a list of edges from function description \p D and builds
841   /// the graph from it. Allocates several internal dynamic structures that are
842   /// later destroyed by ~Graph() and uses \p Alloc. D.LeafNodes contains all
843   /// spanning tree leaf node descriptions (their counters). They are the seed
844   /// used to compute the rest of the missing edge counts in a bottom-up
845   /// traversal of the spanning tree.
846   Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
847         const uint64_t *Counters, ProfileWriterContext &Ctx);
848   ~Graph();
849   void dump() const;
850 
851 private:
852   void computeEdgeFrequencies(const uint64_t *Counters,
853                               ProfileWriterContext &Ctx);
854   void dumpEdgeFreqs() const;
855 };
856 
857 Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
858              const uint64_t *Counters, ProfileWriterContext &Ctx)
859     : Alloc(Alloc), D(D) {
860   DEBUG(reportNumber("G = 0x", (uint64_t)this, 16));
861   // First pass to determine number of nodes
862   int32_t MaxNodes = -1;
863   CallFreqs = nullptr;
864   EdgeFreqs = nullptr;
865   for (int I = 0; I < D.NumEdges; ++I) {
866     if (static_cast<int32_t>(D.Edges[I].FromNode) > MaxNodes)
867       MaxNodes = D.Edges[I].FromNode;
868     if (static_cast<int32_t>(D.Edges[I].ToNode) > MaxNodes)
869       MaxNodes = D.Edges[I].ToNode;
870   }
871 
872   for (int I = 0; I < D.NumLeafNodes; ++I)
873     if (static_cast<int32_t>(D.LeafNodes[I].Node) > MaxNodes)
874       MaxNodes = D.LeafNodes[I].Node;
875 
876   for (int I = 0; I < D.NumCalls; ++I)
877     if (static_cast<int32_t>(D.Calls[I].FromNode) > MaxNodes)
878       MaxNodes = D.Calls[I].FromNode;
879 
880   // No nodes? Nothing to do
881   if (MaxNodes < 0) {
882     DEBUG(report("No nodes!\n"));
883     CFGNodes = nullptr;
884     SpanningTreeNodes = nullptr;
885     NumNodes = 0;
886     return;
887   }
888   ++MaxNodes;
889   DEBUG(reportNumber("NumNodes = ", MaxNodes, 10));
890   NumNodes = static_cast<uint32_t>(MaxNodes);
891 
892   // Initial allocations
893   CFGNodes = new (Alloc) Node[MaxNodes];
894 
895   DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16));
896   SpanningTreeNodes = new (Alloc) Node[MaxNodes];
897   DEBUG(reportNumber("G->SpanningTreeNodes = 0x",
898                      (uint64_t)SpanningTreeNodes, 16));
899 
900   // Figure out how much to allocate to each vector (in/out edge sets)
901   for (int I = 0; I < D.NumEdges; ++I) {
902     CFGNodes[D.Edges[I].FromNode].NumOutEdges++;
903     CFGNodes[D.Edges[I].ToNode].NumInEdges++;
904     if (D.Edges[I].Counter != 0xffffffff)
905       continue;
906 
907     SpanningTreeNodes[D.Edges[I].FromNode].NumOutEdges++;
908     SpanningTreeNodes[D.Edges[I].ToNode].NumInEdges++;
909   }
910 
911   // Allocate in/out edge sets
912   for (int I = 0; I < MaxNodes; ++I) {
913     if (CFGNodes[I].NumInEdges > 0)
914       CFGNodes[I].InEdges = new (Alloc) Edge[CFGNodes[I].NumInEdges];
915     if (CFGNodes[I].NumOutEdges > 0)
916       CFGNodes[I].OutEdges = new (Alloc) Edge[CFGNodes[I].NumOutEdges];
917     if (SpanningTreeNodes[I].NumInEdges > 0)
918       SpanningTreeNodes[I].InEdges =
919           new (Alloc) Edge[SpanningTreeNodes[I].NumInEdges];
920     if (SpanningTreeNodes[I].NumOutEdges > 0)
921       SpanningTreeNodes[I].OutEdges =
922           new (Alloc) Edge[SpanningTreeNodes[I].NumOutEdges];
923     CFGNodes[I].NumInEdges = 0;
924     CFGNodes[I].NumOutEdges = 0;
925     SpanningTreeNodes[I].NumInEdges = 0;
926     SpanningTreeNodes[I].NumOutEdges = 0;
927   }
928 
929   // Fill in/out edge sets
930   for (int I = 0; I < D.NumEdges; ++I) {
931     const uint32_t Src = D.Edges[I].FromNode;
932     const uint32_t Dst = D.Edges[I].ToNode;
933     Edge *E = &CFGNodes[Src].OutEdges[CFGNodes[Src].NumOutEdges++];
934     E->Node = Dst;
935     E->ID = I;
936 
937     E = &CFGNodes[Dst].InEdges[CFGNodes[Dst].NumInEdges++];
938     E->Node = Src;
939     E->ID = I;
940 
941     if (D.Edges[I].Counter != 0xffffffff)
942       continue;
943 
944     E = &SpanningTreeNodes[Src]
945              .OutEdges[SpanningTreeNodes[Src].NumOutEdges++];
946     E->Node = Dst;
947     E->ID = I;
948 
949     E = &SpanningTreeNodes[Dst]
950              .InEdges[SpanningTreeNodes[Dst].NumInEdges++];
951     E->Node = Src;
952     E->ID = I;
953   }
954 
955   computeEdgeFrequencies(Counters, Ctx);
956 }
957 
958 Graph::~Graph() {
959   if (CallFreqs)
960     Alloc.deallocate(CallFreqs);
961   if (EdgeFreqs)
962     Alloc.deallocate(EdgeFreqs);
963   for (int I = NumNodes - 1; I >= 0; --I) {
964     if (SpanningTreeNodes[I].OutEdges)
965       Alloc.deallocate(SpanningTreeNodes[I].OutEdges);
966     if (SpanningTreeNodes[I].InEdges)
967       Alloc.deallocate(SpanningTreeNodes[I].InEdges);
968     if (CFGNodes[I].OutEdges)
969       Alloc.deallocate(CFGNodes[I].OutEdges);
970     if (CFGNodes[I].InEdges)
971       Alloc.deallocate(CFGNodes[I].InEdges);
972   }
973   if (SpanningTreeNodes)
974     Alloc.deallocate(SpanningTreeNodes);
975   if (CFGNodes)
976     Alloc.deallocate(CFGNodes);
977 }
978 
979 void Graph::dump() const {
980   reportNumber("Dumping graph with number of nodes: ", NumNodes, 10);
981   report("  Full graph:\n");
982   for (int I = 0; I < NumNodes; ++I) {
983     const Node *N = &CFGNodes[I];
984     reportNumber("    Node #", I, 10);
985     reportNumber("      InEdges total ", N->NumInEdges, 10);
986     for (int J = 0; J < N->NumInEdges; ++J)
987       reportNumber("        ", N->InEdges[J].Node, 10);
988     reportNumber("      OutEdges total ", N->NumOutEdges, 10);
989     for (int J = 0; J < N->NumOutEdges; ++J)
990       reportNumber("        ", N->OutEdges[J].Node, 10);
991     report("\n");
992   }
993   report("  Spanning tree:\n");
994   for (int I = 0; I < NumNodes; ++I) {
995     const Node *N = &SpanningTreeNodes[I];
996     reportNumber("    Node #", I, 10);
997     reportNumber("      InEdges total ", N->NumInEdges, 10);
998     for (int J = 0; J < N->NumInEdges; ++J)
999       reportNumber("        ", N->InEdges[J].Node, 10);
1000     reportNumber("      OutEdges total ", N->NumOutEdges, 10);
1001     for (int J = 0; J < N->NumOutEdges; ++J)
1002       reportNumber("        ", N->OutEdges[J].Node, 10);
1003     report("\n");
1004   }
1005 }
1006 
1007 void Graph::dumpEdgeFreqs() const {
1008   reportNumber(
1009       "Dumping edge frequencies for graph with num edges: ", D.NumEdges, 10);
1010   for (int I = 0; I < D.NumEdges; ++I) {
1011     reportNumber("* Src: ", D.Edges[I].FromNode, 10);
1012     reportNumber("  Dst: ", D.Edges[I].ToNode, 10);
1013     reportNumber("    Cnt: ", EdgeFreqs[I], 10);
1014   }
1015 }
1016 
1017 /// Auxiliary map structure for fast lookups of which calls map to each node of
1018 /// the function CFG
1019 struct NodeToCallsMap {
1020   struct MapEntry {
1021     uint32_t NumCalls;
1022     uint32_t *Calls;
1023   };
1024   MapEntry *Entries;
1025   BumpPtrAllocator &Alloc;
1026   const uint32_t NumNodes;
1027 
1028   NodeToCallsMap(BumpPtrAllocator &Alloc, const FunctionDescription &D,
1029                  uint32_t NumNodes)
1030       : Alloc(Alloc), NumNodes(NumNodes) {
1031     Entries = new (Alloc, 0) MapEntry[NumNodes];
1032     for (int I = 0; I < D.NumCalls; ++I) {
1033       DEBUG(reportNumber("Registering call in node ", D.Calls[I].FromNode, 10));
1034       ++Entries[D.Calls[I].FromNode].NumCalls;
1035     }
1036     for (int I = 0; I < NumNodes; ++I) {
1037       Entries[I].Calls = Entries[I].NumCalls ? new (Alloc)
1038                                                    uint32_t[Entries[I].NumCalls]
1039                                              : nullptr;
1040       Entries[I].NumCalls = 0;
1041     }
1042     for (int I = 0; I < D.NumCalls; ++I) {
1043       MapEntry &Entry = Entries[D.Calls[I].FromNode];
1044       Entry.Calls[Entry.NumCalls++] = I;
1045     }
1046   }
1047 
1048   /// Set the frequency of all calls in node \p NodeID to Freq. However, if
1049   /// the calls have their own counters and do not depend on the basic block
1050   /// counter, this means they have landing pads and throw exceptions. In this
1051   /// case, set their frequency with their counters and return the maximum
1052   /// value observed in such counters. This will be used as the new frequency
1053   /// at basic block entry. This is used to fix the CFG edge frequencies in the
1054   /// presence of exceptions.
1055   uint64_t visitAllCallsIn(uint32_t NodeID, uint64_t Freq, uint64_t *CallFreqs,
1056                            const FunctionDescription &D,
1057                            const uint64_t *Counters,
1058                            ProfileWriterContext &Ctx) const {
1059     const MapEntry &Entry = Entries[NodeID];
1060     uint64_t MaxValue = 0ull;
1061     for (int I = 0, E = Entry.NumCalls; I != E; ++I) {
1062       const uint32_t CallID = Entry.Calls[I];
1063       DEBUG(reportNumber("  Setting freq for call ID: ", CallID, 10));
1064       const CallDescription &CallDesc = D.Calls[CallID];
1065       if (CallDesc.Counter == 0xffffffff) {
1066         CallFreqs[CallID] = Freq;
1067         DEBUG(reportNumber("  with : ", Freq, 10));
1068       } else {
1069         const uint64_t CounterVal = Counters[CallDesc.Counter];
1070         CallFreqs[CallID] = CounterVal;
1071         MaxValue = CounterVal > MaxValue ? CounterVal : MaxValue;
1072         DEBUG(reportNumber("  with (private counter) : ", CounterVal, 10));
1073       }
1074       DEBUG(reportNumber("  Address: 0x", CallDesc.TargetAddress, 16));
1075       if (CallFreqs[CallID] > 0)
1076         Ctx.CallFlowTable->get(CallDesc.TargetAddress).Calls +=
1077             CallFreqs[CallID];
1078     }
1079     return MaxValue;
1080   }
1081 
1082   ~NodeToCallsMap() {
1083     for (int I = NumNodes - 1; I >= 0; --I)
1084       if (Entries[I].Calls)
1085         Alloc.deallocate(Entries[I].Calls);
1086     Alloc.deallocate(Entries);
1087   }
1088 };
1089 
1090 /// Fill an array with the frequency of each edge in the function represented
1091 /// by G, as well as another array for each call.
1092 void Graph::computeEdgeFrequencies(const uint64_t *Counters,
1093                                    ProfileWriterContext &Ctx) {
1094   if (NumNodes == 0)
1095     return;
1096 
1097   EdgeFreqs = D.NumEdges ? new (Alloc, 0) uint64_t [D.NumEdges] : nullptr;
1098   CallFreqs = D.NumCalls ? new (Alloc, 0) uint64_t [D.NumCalls] : nullptr;
1099 
1100   // Set up a lookup for calls present in each node (BB)
1101   NodeToCallsMap *CallMap = new (Alloc) NodeToCallsMap(Alloc, D, NumNodes);
1102 
1103   // Perform a bottom-up, post-order traversal of the spanning tree in G. Edges
1104   // in the spanning tree don't have explicit counters. We must infer their value
1105   // using a linear combination of other counters (sum of counters of the
1106   // outgoing edges minus sum of counters of the incoming edges).
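  // Illustrative example: if a node's two counted outgoing CFG edges read 5
  // and 3, its frequency is 8; with one counted incoming edge of 2, the
  // remaining (spanning tree) incoming edge is inferred as 8 - 2 = 6.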
1107   uint32_t *Stack = new (Alloc) uint32_t [NumNodes];
1108   uint32_t StackTop = 0;
1109   enum Status : uint8_t { S_NEW = 0, S_VISITING, S_VISITED };
1110   Status *Visited = new (Alloc, 0) Status[NumNodes];
1111   uint64_t *LeafFrequency = new (Alloc, 0) uint64_t[NumNodes];
1112   uint64_t *EntryAddress = new (Alloc, 0) uint64_t[NumNodes];
1113 
1114   // Set up a fast lookup for frequency of leaf nodes, which have special
1115   // basic block frequency instrumentation (they are not edge profiled).
1116   for (int I = 0; I < D.NumLeafNodes; ++I) {
1117     LeafFrequency[D.LeafNodes[I].Node] = Counters[D.LeafNodes[I].Counter];
1118     DEBUG({
1119       if (Counters[D.LeafNodes[I].Counter] > 0) {
1120         reportNumber("Leaf Node# ", D.LeafNodes[I].Node, 10);
1121         reportNumber("     Counter: ", Counters[D.LeafNodes[I].Counter], 10);
1122       }
1123     });
1124   }
1125   for (int I = 0; I < D.NumEntryNodes; ++I) {
1126     EntryAddress[D.EntryNodes[I].Node] = D.EntryNodes[I].Address;
1127     DEBUG({
1128         reportNumber("Entry Node# ", D.EntryNodes[I].Node, 10);
1129         reportNumber("      Address: ", D.EntryNodes[I].Address, 16);
1130     });
1131   }
1132   // Add all root nodes to the stack
1133   for (int I = 0; I < NumNodes; ++I)
1134     if (SpanningTreeNodes[I].NumInEdges == 0)
1135       Stack[StackTop++] = I;
1136 
1137   // Empty stack?
1138   if (StackTop == 0) {
1139     DEBUG(report("Empty stack!\n"));
1140     Alloc.deallocate(EntryAddress);
1141     Alloc.deallocate(LeafFrequency);
1142     Alloc.deallocate(Visited);
1143     Alloc.deallocate(Stack);
1144     CallMap->~NodeToCallsMap();
1145     Alloc.deallocate(CallMap);
1146     if (CallFreqs)
1147       Alloc.deallocate(CallFreqs);
1148     if (EdgeFreqs)
1149       Alloc.deallocate(EdgeFreqs);
1150     EdgeFreqs = nullptr;
1151     CallFreqs = nullptr;
1152     return;
1153   }
1154   // Add all known edge counts, will infer the rest
1155   for (int I = 0; I < D.NumEdges; ++I) {
1156     const uint32_t C = D.Edges[I].Counter;
1157     if (C == 0xffffffff) // inferred counter - we will compute its value
1158       continue;
1159     EdgeFreqs[I] = Counters[C];
1160   }
1161 
1162   while (StackTop > 0) {
1163     const uint32_t Cur = Stack[--StackTop];
1164     DEBUG({
1165       if (Visited[Cur] == S_VISITING)
1166         report("(visiting) ");
1167       else
1168         report("(new) ");
1169       reportNumber("Cur: ", Cur, 10);
1170     });
1171 
1172     // This shouldn't happen in a tree
1173     assert(Visited[Cur] != S_VISITED, "should not have visited nodes in stack");
1174     if (Visited[Cur] == S_NEW) {
1175       Visited[Cur] = S_VISITING;
1176       Stack[StackTop++] = Cur;
1177       assert(StackTop <= NumNodes, "stack grew too large");
1178       for (int I = 0, E = SpanningTreeNodes[Cur].NumOutEdges; I < E; ++I) {
1179         const uint32_t Succ = SpanningTreeNodes[Cur].OutEdges[I].Node;
1180         Stack[StackTop++] = Succ;
1181         assert(StackTop <= NumNodes, "stack grew too large");
1182       }
1183       continue;
1184     }
1185     Visited[Cur] = S_VISITED;
1186 
1187     // Establish our node frequency based on outgoing edges, which should all be
1188     // resolved by now.
1189     int64_t CurNodeFreq = LeafFrequency[Cur];
1190     // Not a leaf?
1191     if (!CurNodeFreq) {
1192       for (int I = 0, E = CFGNodes[Cur].NumOutEdges; I != E; ++I) {
1193         const uint32_t SuccEdge = CFGNodes[Cur].OutEdges[I].ID;
1194         CurNodeFreq += EdgeFreqs[SuccEdge];
1195       }
1196     }
1197     if (CurNodeFreq < 0)
1198       CurNodeFreq = 0;
1199 
1200     const uint64_t CallFreq = CallMap->visitAllCallsIn(
1201         Cur, CurNodeFreq > 0 ? CurNodeFreq : 0, CallFreqs, D, Counters, Ctx);
1202 
1203     // Exception handling affected our output flow? Fix with calls info
1204     DEBUG({
1205       if (CallFreq > CurNodeFreq)
1206         report("Bumping node frequency with call info\n");
1207     });
1208     CurNodeFreq = CallFreq > CurNodeFreq ? CallFreq : CurNodeFreq;
1209 
1210     if (CurNodeFreq > 0) {
1211       if (uint64_t Addr = EntryAddress[Cur]) {
1212         DEBUG(
1213             reportNumber("  Setting flow at entry point address 0x", Addr, 16));
1214         DEBUG(reportNumber("  with: ", CurNodeFreq, 10));
1215         Ctx.CallFlowTable->get(Addr).Val = CurNodeFreq;
1216       }
1217     }
1218 
1219     // No parent? Reached a tree root, limit to call frequency updating.
1220     if (SpanningTreeNodes[Cur].NumInEdges == 0)
1221       continue;
1222 
1223     assert(SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent");
1224     const uint32_t Parent = SpanningTreeNodes[Cur].InEdges[0].Node;
1225     const uint32_t ParentEdge = SpanningTreeNodes[Cur].InEdges[0].ID;
1226 
1227     // Calculate parent edge freq.
1228     int64_t ParentEdgeFreq = CurNodeFreq;
1229     for (int I = 0, E = CFGNodes[Cur].NumInEdges; I != E; ++I) {
1230       const uint32_t PredEdge = CFGNodes[Cur].InEdges[I].ID;
1231       ParentEdgeFreq -= EdgeFreqs[PredEdge];
1232     }
1233 
1234     // Sometimes the conservative CFG that BOLT builds will lead to incorrect
1235     // flow computation. For example, in a BB that transitively calls the exit
1236     // syscall, BOLT will add a fall-through successor even though it should not
1237     // have any successors. So this block execution will likely be wrong. We
1238     // tolerate this imperfection since this case should be quite infrequent.
1239     if (ParentEdgeFreq < 0) {
1240       DEBUG(dumpEdgeFreqs());
1241       DEBUG(report("WARNING: incorrect flow"));
1242       ParentEdgeFreq = 0;
1243     }
1244     DEBUG(reportNumber("  Setting freq for ParentEdge: ", ParentEdge, 10));
1245     DEBUG(reportNumber("  with ParentEdgeFreq: ", ParentEdgeFreq, 10));
1246     EdgeFreqs[ParentEdge] = ParentEdgeFreq;
1247   }
1248 
1249   Alloc.deallocate(EntryAddress);
1250   Alloc.deallocate(LeafFrequency);
1251   Alloc.deallocate(Visited);
1252   Alloc.deallocate(Stack);
1253   CallMap->~NodeToCallsMap();
1254   Alloc.deallocate(CallMap);
1255   DEBUG(dumpEdgeFreqs());
1256 }
1257 
1258 /// Write to \p FD all of the edge profiles for function \p FuncDesc. Uses
1259 /// \p Alloc to allocate helper dynamic structures used to compute profile for
1260 /// edges that we do not explicitly instrument.
1261 const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
1262                                     const uint8_t *FuncDesc,
1263                                     BumpPtrAllocator &Alloc) {
1264   const FunctionDescription F(FuncDesc);
1265   const uint8_t *next = FuncDesc + F.getSize();
1266 
1267 #if !defined(__APPLE__)
1268   uint64_t *bolt_instr_locations = __bolt_instr_locations;
1269 #else
1270   uint64_t *bolt_instr_locations = _bolt_instr_locations_getter();
1271 #endif
1272 
1273   // Skip funcs we know are cold
1274 #ifndef ENABLE_DEBUG
1275   uint64_t CountersFreq = 0;
1276   for (int I = 0; I < F.NumLeafNodes; ++I)
1277     CountersFreq += bolt_instr_locations[F.LeafNodes[I].Counter];
1278 
1279   if (CountersFreq == 0) {
1280     for (int I = 0; I < F.NumEdges; ++I) {
1281       const uint32_t C = F.Edges[I].Counter;
1282       if (C == 0xffffffff)
1283         continue;
1284       CountersFreq += bolt_instr_locations[C];
1285     }
1286     if (CountersFreq == 0) {
1287       for (int I = 0; I < F.NumCalls; ++I) {
1288         const uint32_t C = F.Calls[I].Counter;
1289         if (C == 0xffffffff)
1290           continue;
1291         CountersFreq += bolt_instr_locations[C];
1292       }
1293       if (CountersFreq == 0)
1294         return next;
1295     }
1296   }
1297 #endif
1298 
1299   Graph *G = new (Alloc) Graph(Alloc, F, bolt_instr_locations, Ctx);
1300   DEBUG(G->dump());
1301 
1302   if (!G->EdgeFreqs && !G->CallFreqs) {
1303     G->~Graph();
1304     Alloc.deallocate(G);
1305     return next;
1306   }
1307 
1308   for (int I = 0; I < F.NumEdges; ++I) {
1309     const uint64_t Freq = G->EdgeFreqs[I];
1310     if (Freq == 0)
1311       continue;
1312     const EdgeDescription *Desc = &F.Edges[I];
1313     char LineBuf[BufSize];
1314     char *Ptr = LineBuf;
1315     Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
1316     Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
1317     Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 22);
1318     Ptr = intToStr(Ptr, Freq, 10);
1319     *Ptr++ = '\n';
1320     __write(FD, LineBuf, Ptr - LineBuf);
1321   }
1322 
1323   for (int I = 0; I < F.NumCalls; ++I) {
1324     const uint64_t Freq = G->CallFreqs[I];
1325     if (Freq == 0)
1326       continue;
1327     char LineBuf[BufSize];
1328     char *Ptr = LineBuf;
1329     const CallDescription *Desc = &F.Calls[I];
1330     Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
1331     Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
1332     Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1333     Ptr = intToStr(Ptr, Freq, 10);
1334     *Ptr++ = '\n';
1335     __write(FD, LineBuf, Ptr - LineBuf);
1336   }
1337 
1338   G->~Graph();
1339   Alloc.deallocate(G);
1340   return next;
1341 }
1342 
1343 #if !defined(__APPLE__)
1344 const IndCallTargetDescription *
1345 ProfileWriterContext::lookupIndCallTarget(uint64_t Target) const {
1346   uint32_t B = 0;
1347   uint32_t E = __bolt_instr_num_ind_targets;
1348   if (E == 0)
1349     return nullptr;
1350   do {
1351     uint32_t I = (E - B) / 2 + B;
1352     if (IndCallTargets[I].Address == Target)
1353       return &IndCallTargets[I];
1354     if (IndCallTargets[I].Address < Target)
1355       B = I + 1;
1356     else
1357       E = I;
1358   } while (B < E);
1359   return nullptr;
1360 }
1361 
1362 /// Write a single indirect call <src, target> pair to the fdata file
1363 void visitIndCallCounter(IndirectCallHashTable::MapEntry &Entry,
1364                          int FD, int CallsiteID,
1365                          ProfileWriterContext *Ctx) {
1366   if (Entry.Val == 0)
1367     return;
1368   DEBUG(reportNumber("Target func 0x", Entry.Key, 16));
1369   DEBUG(reportNumber("Target freq: ", Entry.Val, 10));
1370   const IndCallDescription *CallsiteDesc =
1371       &Ctx->IndCallDescriptions[CallsiteID];
1372   const IndCallTargetDescription *TargetDesc =
1373       Ctx->lookupIndCallTarget(Entry.Key);
1374   if (!TargetDesc) {
1375     DEBUG(report("Failed to lookup indirect call target\n"));
1376     char LineBuf[BufSize];
1377     char *Ptr = LineBuf;
1378     Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
1379     Ptr = strCopy(Ptr, "0 [unknown] 0 0 ", BufSize - (Ptr - LineBuf) - 40);
1380     Ptr = intToStr(Ptr, Entry.Val, 10);
1381     *Ptr++ = '\n';
1382     __write(FD, LineBuf, Ptr - LineBuf);
1383     return;
1384   }
1385   Ctx->CallFlowTable->get(TargetDesc->Address).Calls += Entry.Val;
1386   char LineBuf[BufSize];
1387   char *Ptr = LineBuf;
1388   Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
1389   Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
1390   Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1391   Ptr = intToStr(Ptr, Entry.Val, 10);
1392   *Ptr++ = '\n';
1393   __write(FD, LineBuf, Ptr - LineBuf);
1394 }
1395 
1396 /// Write to \p FD all of the indirect call profiles.
1397 void writeIndirectCallProfile(int FD, ProfileWriterContext &Ctx) {
1398   for (int I = 0; I < __bolt_instr_num_ind_calls; ++I) {
1399     DEBUG(reportNumber("IndCallsite #", I, 10));
1400     GlobalIndCallCounters[I].forEachElement(visitIndCallCounter, FD, I, &Ctx);
1401   }
1402 }
1403 
1404 /// Check a single call flow for a callee versus all known callers. If there
1405 /// are fewer callers than the callee expects, write the difference with
1406 /// source [unknown] in the profile.
1407 void visitCallFlowEntry(CallFlowHashTable::MapEntry &Entry, int FD,
1408                         ProfileWriterContext *Ctx) {
1409   DEBUG(reportNumber("Call flow entry address: 0x", Entry.Key, 16));
1410   DEBUG(reportNumber("Calls: ", Entry.Calls, 10));
1411   DEBUG(reportNumber("Reported entry frequency: ", Entry.Val, 10));
1412   DEBUG({
1413     if (Entry.Calls > Entry.Val)
1414       report("  More calls than expected!\n");
1415   });
1416   if (Entry.Val <= Entry.Calls)
1417     return;
1418   DEBUG(reportNumber(
1419       "  Balancing calls with traffic: ", Entry.Val - Entry.Calls, 10));
1420   const IndCallTargetDescription *TargetDesc =
1421       Ctx->lookupIndCallTarget(Entry.Key);
1422   if (!TargetDesc) {
1423     // There is probably something wrong with this callee and it should be
1424     // investigated, but we don't want to assert and lose all collected data.
1425     DEBUG(report("WARNING: failed to look up call target!\n"));
1426     return;
1427   }
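  // Emit the residual count with an [unknown] source; a hypothetical line:
  // "0 [unknown] 0 1 callee 0 0 42".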
1428   char LineBuf[BufSize];
1429   char *Ptr = LineBuf;
1430   Ptr = strCopy(Ptr, "0 [unknown] 0 ", BufSize);
1431   Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
1432   Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1433   Ptr = intToStr(Ptr, Entry.Val - Entry.Calls, 10);
1434   *Ptr++ = '\n';
1435   __write(FD, LineBuf, Ptr - LineBuf);
1436 }
1437 
1438 /// Open the fdata file for writing and return a valid file descriptor,
1439 /// aborting the program upon failure.
1440 int openProfile() {
1441   // Build the profile file name, appending our PID when requested
1442   char Buf[BufSize];
1443   char *Ptr = Buf;
1444   uint64_t PID = __getpid();
1445   Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
1446   if (__bolt_instr_use_pid) {
1447     Ptr = strCopy(Ptr, ".", BufSize - (Ptr - Buf + 1));
1448     Ptr = intToStr(Ptr, PID, 10);
1449     Ptr = strCopy(Ptr, ".fdata", BufSize - (Ptr - Buf + 1));
1450   }
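  // With PID suffixing enabled the result looks like, e.g. (hypothetical),
  // "<filename>.1234.fdata"; otherwise the configured name is used verbatim.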
1451   *Ptr++ = '\0';
1452   uint64_t FD = __open(Buf,
1453                        /*flags=*/0x241 /*O_WRONLY|O_TRUNC|O_CREAT*/,
1454                        /*mode=*/0666);
1455   if (static_cast<int64_t>(FD) < 0) {
1456     report("Error while trying to open profile file for writing: ");
1457     report(Buf);
1458     reportNumber("\nFailed with error number: 0x",
1459                  0 - static_cast<int64_t>(FD), 16);
1460     __exit(1);
1461   }
1462   return FD;
1463 }
1464 
1465 #endif
1466 
1467 } // anonymous namespace
1468 
1469 #if !defined(__APPLE__)
1470 
1471 /// Reset all counters in case you want to start profiling a new phase of your
1472 /// program independently of prior phases.
1473 /// BOLT prints the address of this function, and it can be called by any
1474 /// attached debugger at runtime. A useful one-liner for gdb:
1475 ///
1476 ///   gdb -p $(pgrep -xo PROCESSNAME) -ex 'p ((void(*)())0xdeadbeef)()' \
1477 ///     -ex 'set confirm off' -ex quit
1478 ///
1479 /// where 0xdeadbeef is this function's address and PROCESSNAME is your
1480 /// binary's file name.
1481 extern "C" void __bolt_instr_clear_counters() {
1482   memset(reinterpret_cast<char *>(__bolt_instr_locations), 0,
1483          __bolt_num_counters * 8);
1484   for (int I = 0; I < __bolt_instr_num_ind_calls; ++I)
1485     GlobalIndCallCounters[I].resetCounters();
1486 }
1487 
1488 /// This is the entry point for profile writing.
1489 /// There are three ways of getting here:
1490 ///
1491 ///  * Program execution ended, finalization methods are running and BOLT
1492 ///    hooked into FINI from your binary dynamic section;
1493 ///  * You used the sleep timer option and during initialization we forked
1494 ///    a separate process that will call this function periodically;
1495 ///  * BOLT prints this function address so you can attach a debugger and
1496 ///    call this function directly to get your profile written to disk
1497 ///    on demand.
1498 ///
1499 extern "C" void __attribute((force_align_arg_pointer))
1500 __bolt_instr_data_dump() {
1501   // Already dumping
1502   if (!GlobalWriteProfileMutex->acquire())
1503     return;
1504 
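  // Cap this allocator at 100 MiB (0x6400000 bytes); it only backs the
  // call-flow hash table built while writing the profile.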
1505   BumpPtrAllocator HashAlloc;
1506   HashAlloc.setMaxSize(0x6400000);
1507   ProfileWriterContext Ctx = readDescriptions();
1508   Ctx.CallFlowTable = new (HashAlloc, 0) CallFlowHashTable(HashAlloc);
1509 
1510   DEBUG(printStats(Ctx));
1511 
1512   int FD = openProfile();
1513 
1514   BumpPtrAllocator Alloc;
1515   Alloc.setMaxSize(0x6400000);
1516   const uint8_t *FuncDesc = Ctx.FuncDescriptions;
1517   for (int I = 0, E = __bolt_instr_num_funcs; I < E; ++I) {
1518     FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
1519     Alloc.clear();
1520     DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
1521   }
1522   assert(FuncDesc == (void *)Ctx.Strings,
1523          "FuncDesc ptr must be equal to stringtable");
1524 
1525   writeIndirectCallProfile(FD, Ctx);
1526   Ctx.CallFlowTable->forEachElement(visitCallFlowEntry, FD, &Ctx);
1527 
1528   __fsync(FD);
1529   __close(FD);
1530   __munmap(Ctx.MMapPtr, Ctx.MMapSize);
1531   __close(Ctx.FileDesc);
1532   HashAlloc.destroy();
1533   GlobalWriteProfileMutex->release();
1534   DEBUG(report("Finished writing profile.\n"));
1535 }
1536 
1537 /// Event loop for our child process spawned during setup to dump profile data
1538 /// at user-specified intervals
1539 void watchProcess() {
1540   timespec ts, rem;
1541   uint64_t Elapsed = 0ull;
1542   uint64_t ppid;
1543   if (__bolt_instr_wait_forks) {
1544     // Store the negated parent pgid so __kill() probes the whole process group
1545     ppid = -__getpgid(0);
1546     // And leave parent process group
1547     __setpgid(0, 0);
1548   } else {
1549     // Store parent pid
1550     ppid = __getppid();
1551     if (ppid == 1) {
1552       // Parent already dead (we have been reparented to init)
1553       __bolt_instr_data_dump();
1554       goto out;
1555     }
1556   }
1557 
1558   ts.tv_sec = 1;
1559   ts.tv_nsec = 0;
1560   while (1) {
1561     __nanosleep(&ts, &rem);
1562     // This means our parent process or all its forks are dead,
1563     // so no need for us to keep dumping.
1564     if (__kill(ppid, 0) < 0) {
1565       if (__bolt_instr_no_counters_clear)
1566         __bolt_instr_data_dump();
1567       break;
1568     }
1569 
1570     if (++Elapsed < __bolt_instr_sleep_time)
1571       continue;
1572 
1573     Elapsed = 0;
1574     __bolt_instr_data_dump();
1575     if (!__bolt_instr_no_counters_clear)
1576       __bolt_instr_clear_counters();
1577   }
1578 
1579 out:;
1580   DEBUG(report("My parent process is dead, bye!\n"));
1581   __exit(0);
1582 }
1583 
1584 extern "C" void __bolt_instr_indirect_call();
1585 extern "C" void __bolt_instr_indirect_tailcall();
1586 
1587 /// Initialization code
1588 extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() {
1589   __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call;
1590   __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall;
1591 
1592   const uint64_t CountersStart =
1593       reinterpret_cast<uint64_t>(&__bolt_instr_locations[0]);
1594   const uint64_t CountersEnd = alignTo(
1595       reinterpret_cast<uint64_t>(&__bolt_instr_locations[__bolt_num_counters]),
1596       0x1000);
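  // Round the end of the counter area up to a 4 KiB page boundary so the
  // region can be remapped with mmap below.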
1597   DEBUG(reportNumber("replace mmap start: ", CountersStart, 16));
1598   DEBUG(reportNumber("replace mmap stop: ", CountersEnd, 16));
1599   assert(CountersEnd > CountersStart, "no counters");
1600 
1601   const bool Shared = !__bolt_instr_use_pid;
1602   const uint64_t MapPrivateOrShared = Shared ? MAP_SHARED : MAP_PRIVATE;
1603 
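  // Remap the counter area in place. When a single profile is written for the
  // whole process tree, MAP_SHARED keeps the counters visible across fork()
  // (the watcher and any forked children update the same pages); with per-PID
  // profiles a private mapping is enough.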
1604   void *Ret =
1605       __mmap(CountersStart, CountersEnd - CountersStart, PROT_READ | PROT_WRITE,
1606              MAP_ANONYMOUS | MapPrivateOrShared | MAP_FIXED, -1, 0);
1607   assert(Ret != MAP_FAILED, "__bolt_instr_setup: Failed to mmap counters!");
1608 
1609   // Conservatively allow up to 100 MiB of shared pages
1610   GlobalAlloc.setMaxSize(0x6400000);
1611   GlobalAlloc.setShared(Shared);
1612   GlobalWriteProfileMutex = new (GlobalAlloc, 0) Mutex();
1613   if (__bolt_instr_num_ind_calls > 0)
1614     GlobalIndCallCounters =
1615         new (GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls];
1616 
1617   if (__bolt_instr_sleep_time != 0) {
1618     // Move the instrumented process into its own process group
1619     if (__bolt_instr_wait_forks)
1620       __setpgid(0, 0);
1621 
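    // Fork the watcher: __fork() returns non-zero (the child PID) in the
    // parent, which resumes normal execution; the child runs the dump loop.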
1622     if (long PID = __fork())
1623       return;
1624     watchProcess();
1625   }
1626 }
1627 
1628 extern "C" __attribute((force_align_arg_pointer)) void
1629 instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) {
1630   GlobalIndCallCounters[IndCallID].incrementVal(Target, GlobalAlloc);
1631 }
1632 
1633 /// We receive as in-stack arguments the identifier of the indirect call site
1634 /// as well as the target address for the call
1635 extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
1636 {
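  // After SAVE_ALL, the two in-stack arguments pushed at the instrumented
  // call site sit above the saved-register block: 0xa0(%rsp) holds the call
  // target and 0x98(%rsp) the call-site ID, matching instrumentIndirectCall's
  // (Target, IndCallID) signature.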
1637   __asm__ __volatile__(SAVE_ALL
1638                        "mov 0xa0(%%rsp), %%rdi\n"
1639                        "mov 0x98(%%rsp), %%rsi\n"
1640                        "call instrumentIndirectCall\n"
1641                        RESTORE_ALL
1642                        "ret\n"
1643                        :::);
1644 }
1645 
1646 extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
1647 {
1648   __asm__ __volatile__(SAVE_ALL
1649                        "mov 0x98(%%rsp), %%rdi\n"
1650                        "mov 0x90(%%rsp), %%rsi\n"
1651                        "call instrumentIndirectCall\n"
1652                        RESTORE_ALL
1653                        "ret\n"
1654                        :::);
1655 }
1656 
1657 /// This hooks the ELF entry point; it needs to save all machine state.
1658 extern "C" __attribute((naked)) void __bolt_instr_start()
1659 {
1660   __asm__ __volatile__(SAVE_ALL
1661                        "call __bolt_instr_setup\n"
1662                        RESTORE_ALL
1663                        "jmp __bolt_start_trampoline\n"
1664                        :::);
1665 }
1666 
1667 /// This hooks into the ELF DT_FINI routine.
1668 extern "C" void __bolt_instr_fini() {
1669   __bolt_fini_trampoline();
1670   if (__bolt_instr_sleep_time == 0)
1671     __bolt_instr_data_dump();
1672   DEBUG(report("Finished.\n"));
1673 }
1674 
1675 #endif
1676 
1677 #if defined(__APPLE__)
1678 
1679 extern "C" void __bolt_instr_data_dump() {
1680   ProfileWriterContext Ctx = readDescriptions();
1681 
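  // The Darwin port does not open an fdata file yet; write the profile
  // straight to stderr (file descriptor 2).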
1682   int FD = 2;
1683   BumpPtrAllocator Alloc;
1684   const uint8_t *FuncDesc = Ctx.FuncDescriptions;
1685   uint32_t bolt_instr_num_funcs = _bolt_instr_num_funcs_getter();
1686 
1687   for (int I = 0, E = bolt_instr_num_funcs; I < E; ++I) {
1688     FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
1689     Alloc.clear();
1690     DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
1691   }
1692   assert(FuncDesc == (void *)Ctx.Strings,
1693          "FuncDesc ptr must be equal to stringtable");
1694 }
1695 
1696 // On OSX/iOS the final symbol name of an extern "C" function/variable contains
1697 // one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
1698 extern "C"
1699 __attribute__((section("__TEXT,__setup")))
1700 __attribute__((force_align_arg_pointer))
1701 void _bolt_instr_setup() {
1702   __asm__ __volatile__(SAVE_ALL :::);
1703 
1704   report("Hello!\n");
1705 
1706   __asm__ __volatile__(RESTORE_ALL :::);
1707 }
1708 
1709 extern "C"
1710 __attribute__((section("__TEXT,__fini")))
1711 __attribute__((force_align_arg_pointer))
1712 void _bolt_instr_fini() {
1713   report("Bye!\n");
1714   __bolt_instr_data_dump();
1715 }
1716 
1717 #endif
1718 #endif
1719