xref: /llvm-project/bolt/runtime/instr.cpp (revision a799298152e3ef08b4919cdaac7a614f7cca9bc6)
1 //===- bolt/runtime/instr.cpp ---------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // BOLT runtime instrumentation library for x86 Linux. Currently, BOLT does
10 // not support linking modules with dependencies on one another into the final
11 // binary (TODO?), which means this library has to be self-contained in a single
12 // module.
13 //
14 // All extern declarations here need to be defined by BOLT itself. Those will be
15 // undefined symbols that BOLT needs to resolve by emitting these symbols with
16 // MCStreamer. Currently, Passes/Instrumentation.cpp is the pass responsible
17 // for defining the symbols here, and the two files are tightly coupled: one
18 // works statically, when you run BOLT, and the other at program runtime, when
19 // you run an instrumented binary. The main goal here is to output an fdata file
20 // (BOLT profile) with the instrumentation counters inserted by the static pass.
21 // Counters for indirect calls are an exception, as we can't know them
22 // statically. These counters are created and managed here. To allow this, we
23 // need a minimal framework for allocating memory dynamically. We provide this
24 // with the BumpPtrAllocator class (not LLVM's, but our own version of it).
25 //
26 // Since this code is intended to be inserted into any executable, we decided to
27 // make it standalone and not depend on any external libraries (i.e. language
28 // support libraries, such as glibc or stdc++). To allow this, we provide a few
29 // light implementations of common OS-interaction functionality using direct
30 // syscall wrappers. Our simple allocator doesn't manage deallocations that
31 // fragment the memory space, so it's stack-based. This is the minimal framework
32 // provided here to allow processing instrumented counters and writing fdata.
33 //
34 // In the C++ idiom used here, we never use or rely on constructors or
35 // destructors for global objects. That's because those need support from the
36 // linker in initialization/finalization code, and we want to keep our linker
37 // very simple. Similarly, we don't create any global objects that are zero
38 // initialized, since those would need to go in .bss, which our simple linker
39 // also doesn't support (TODO?).
40 //
41 //===----------------------------------------------------------------------===//
42 
43 #if defined (__x86_64__)
44 #include "common.h"
45 
46 // Enables very verbose logging to stderr, useful when debugging
47 //#define ENABLE_DEBUG
48 
49 #ifdef ENABLE_DEBUG
50 #define DEBUG(X)                                                               \
51   { X; }
52 #else
53 #define DEBUG(X)                                                               \
54   {}
55 #endif
56 
57 #pragma GCC visibility push(hidden)
58 
59 extern "C" {
60 
61 #if defined(__APPLE__)
62 extern uint64_t* _bolt_instr_locations_getter();
63 extern uint32_t _bolt_num_counters_getter();
64 
65 extern uint8_t* _bolt_instr_tables_getter();
66 extern uint32_t _bolt_instr_num_funcs_getter();
67 
68 #else
69 
70 // Main counters inserted by instrumentation, incremented during runtime when
71 // points of interest (locations) in the program are reached. Those are direct
72 // calls and direct and indirect branches (local ones). There are also counters
73 // for the execution of basic blocks that are spanning tree leaves and need to
74 // be counted in order to infer the execution counts of other edges of the CFG.
75 extern uint64_t __bolt_instr_locations[];
76 extern uint32_t __bolt_num_counters;
77 // Descriptions are serialized metadata about binary functions written by BOLT,
78 // so we have a minimal understanding of the program structure. For a
79 // reference on the exact format of this metadata, see the *Description structs,
80 // Location, InstrumentedNode and EntryNode.
81 // Number of indirect call site descriptions
82 extern uint32_t __bolt_instr_num_ind_calls;
83 // Number of indirect call target descriptions
84 extern uint32_t __bolt_instr_num_ind_targets;
85 // Number of function descriptions
86 extern uint32_t __bolt_instr_num_funcs;
87 // Time to sleep across dumps (when we write the fdata profile to disk)
88 extern uint32_t __bolt_instr_sleep_time;
89 // Do not clear counters across dumps; rewrite the file with the updated values
90 extern bool __bolt_instr_no_counters_clear;
91 // Wait until all forks of the instrumented process finish
92 extern bool __bolt_instr_wait_forks;
93 // Filename to dump data to
94 extern char __bolt_instr_filename[];
95 // Instrumented binary file path
96 extern char __bolt_instr_binpath[];
97 // If true, append current PID to the fdata filename when creating it so
98 // different invocations of the same program can be differentiated.
99 extern bool __bolt_instr_use_pid;
100 // Functions that will be used to instrument indirect calls. The BOLT static
101 // pass will identify indirect calls and modify them to load the address in
102 // these trampolines and call it instead. BOLT can't use direct calls to
103 // our handlers because our addresses here are not known at analysis time. We
104 // only support resolving dependencies from this file to the output of BOLT,
105 // *not* the other way around.
106 // TODO: We need better linking support to make that happen.
107 extern void (*__bolt_ind_call_counter_func_pointer)();
108 extern void (*__bolt_ind_tailcall_counter_func_pointer)();
109 // Function pointers to init/fini trampoline routines in the binary, so we can
110 // resume regular execution of these functions that we hooked
111 extern void __bolt_start_trampoline();
112 extern void __bolt_fini_trampoline();
113 
114 #endif
115 }
116 
117 namespace {
118 
119 /// A simple allocator that mmaps a fixed size region and manages this space
120 /// in a stack fashion, meaning you always deallocate the last element that
121 /// was allocated. In practice, we don't need to deallocate individual elements.
122 /// We monotonically increase our usage and then deallocate everything once we
123 /// are done processing something.
124 class BumpPtrAllocator {
125   /// This is written before each allocation and acts as a canary to detect when
126   /// a bug causes our program to cross allocation boundaries.
127   struct EntryMetadata {
128     uint64_t Magic;
129     uint64_t AllocSize;
130   };
131 
132 public:
133   void *allocate(size_t Size) {
134     Lock L(M);
135 
136     if (StackBase == nullptr) {
137       StackBase = reinterpret_cast<uint8_t *>(
138           __mmap(0, MaxSize, PROT_READ | PROT_WRITE,
139                  (Shared ? MAP_SHARED : MAP_PRIVATE) | MAP_ANONYMOUS, -1, 0));
140       assert(StackBase != MAP_FAILED,
141              "BumpPtrAllocator: failed to mmap stack!");
142       StackSize = 0;
143     }
144 
145     Size = alignTo(Size + sizeof(EntryMetadata), 16);
146     uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
147     auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
148     M->Magic = Magic;
149     M->AllocSize = Size;
150     StackSize += Size;
151     assert(StackSize < MaxSize, "allocator ran out of memory");
152     return AllocAddress;
153   }
154 
155 #ifdef DEBUG
156   /// Element-wise deallocation is only used for debugging to catch memory
157   /// bugs by checking magic bytes. Ordinarily, we reset the allocator once
158   /// we are done with it. Reset is done with clear(). There's no need
159   /// to deallocate each element individually.
160   void deallocate(void *Ptr) {
161     Lock L(M);
162     uint8_t MetadataOffset = sizeof(EntryMetadata);
163     auto *M = reinterpret_cast<EntryMetadata *>(
164         reinterpret_cast<uint8_t *>(Ptr) - MetadataOffset);
165     const uint8_t *StackTop = StackBase + StackSize + MetadataOffset;
166     // Validate size
167     if (Ptr != StackTop - M->AllocSize) {
168       // Failed validation, check if it is a pointer returned by operator new []
169       MetadataOffset +=
170           sizeof(uint64_t); // Space for number of elements alloc'ed
171       M = reinterpret_cast<EntryMetadata *>(reinterpret_cast<uint8_t *>(Ptr) -
172                                             MetadataOffset);
173       // Ok, it failed both checks if this assertion fails. Stop the program, we
174       // have a memory bug.
175       assert(Ptr == StackTop - M->AllocSize,
176              "must deallocate the last element alloc'ed");
177     }
178     assert(M->Magic == Magic, "allocator magic is corrupt");
179     StackSize -= M->AllocSize;
180   }
181 #else
182   void deallocate(void *) {}
183 #endif
184 
185   void clear() {
186     Lock L(M);
187     StackSize = 0;
188   }
189 
190   /// Set mmap reservation size (only relevant before first allocation)
191   void setMaxSize(uint64_t Size) { MaxSize = Size; }
192 
193   /// Set mmap reservation privacy (only relevant before first allocation)
194   void setShared(bool S) { Shared = S; }
195 
196   void destroy() {
197     if (StackBase == nullptr)
198       return;
199     __munmap(StackBase, MaxSize);
200   }
201 
202   // Placement operator to construct allocator in possibly shared mmapped memory
203   static void *operator new(size_t, void *Ptr) { return Ptr; };
204 
205 private:
206   static constexpr uint64_t Magic = 0x1122334455667788ull;
207   uint64_t MaxSize = 0xa00000;
208   uint8_t *StackBase{nullptr};
209   uint64_t StackSize{0};
210   bool Shared{false};
211   Mutex M;
212 };
213 
214 /// Used for allocating indirect call instrumentation counters. Initialized by
215 /// __bolt_instr_setup, our initialization routine.
216 BumpPtrAllocator *GlobalAlloc;
217 
218 // Storage for GlobalAlloc which can be shared if not using
219 // instrumentation-file-append-pid.
220 void *GlobalMetadataStorage;
221 
222 } // anonymous namespace
223 
224 // User-defined placement new operators. We only use those (as opposed to
225 // overriding the regular operator new) so we can keep our allocator on the
226 // stack instead of in a data section (global).
227 void *operator new(size_t Sz, BumpPtrAllocator &A) { return A.allocate(Sz); }
228 void *operator new(size_t Sz, BumpPtrAllocator &A, char C) {
229   auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
230   memset(Ptr, C, Sz);
231   return Ptr;
232 }
233 void *operator new[](size_t Sz, BumpPtrAllocator &A) {
234   return A.allocate(Sz);
235 }
236 void *operator new[](size_t Sz, BumpPtrAllocator &A, char C) {
237   auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
238   memset(Ptr, C, Sz);
239   return Ptr;
240 }
241 // Only called during exception unwinding (useless). We must manually dealloc.
242 // C++ language weirdness
243 void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); }
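
// Illustrative sketch (intentionally compiled out; allocatorUsageSketch is a
// hypothetical helper, not part of the runtime): how these placement operators
// combine with a BumpPtrAllocator, mirroring the allocation patterns used
// later in this file.
#if 0
void allocatorUsageSketch(BumpPtrAllocator &Alloc) {
  // Zero-initialized array: the extra char argument selects the operator new[]
  // overload above that memsets the freshly allocated block.
  uint64_t *Counters = new (Alloc, 0) uint64_t[16];
  // Uninitialized allocation through the plain overload.
  uint32_t *Ids = new (Alloc) uint32_t[16];
  // Element-wise deallocation must follow LIFO order (see the magic-number
  // checks in BumpPtrAllocator::deallocate); memory is normally reclaimed
  // wholesale with Alloc.clear() instead.
  Alloc.deallocate(Ids);
  Alloc.deallocate(Counters);
}
#endif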
244 
245 namespace {
246 
247 // Disable instrumentation optimizations that sacrifice profile accuracy
248 extern "C" bool __bolt_instr_conservative;
249 
250 /// Basic key-val atom stored in our hash
251 struct SimpleHashTableEntryBase {
252   uint64_t Key;
253   uint64_t Val;
254   void dump(const char *Msg = nullptr) {
255     // TODO: make some sort of formatting function
256     // Currently we have to do it the ugly way because
257     // we want every message to be printed atomically via a single call to
258     // __write. If we use reportNumber() and others multiple times, we'll get
259     // garbage in a multithreaded environment.
260     char Buf[BufSize];
261     char *Ptr = Buf;
262     Ptr = intToStr(Ptr, __getpid(), 10);
263     *Ptr++ = ':';
264     *Ptr++ = ' ';
265     if (Msg)
266       Ptr = strCopy(Ptr, Msg, strLen(Msg));
267     *Ptr++ = '0';
268     *Ptr++ = 'x';
269     Ptr = intToStr(Ptr, (uint64_t)this, 16);
270     *Ptr++ = ':';
271     *Ptr++ = ' ';
272     Ptr = strCopy(Ptr, "MapEntry(0x", sizeof("MapEntry(0x") - 1);
273     Ptr = intToStr(Ptr, Key, 16);
274     *Ptr++ = ',';
275     *Ptr++ = ' ';
276     *Ptr++ = '0';
277     *Ptr++ = 'x';
278     Ptr = intToStr(Ptr, Val, 16);
279     *Ptr++ = ')';
280     *Ptr++ = '\n';
281     assert(Ptr - Buf < BufSize, "Buffer overflow!");
282     // print everything all at once for atomicity
283     __write(2, Buf, Ptr - Buf);
284   }
285 };
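
// For reference, one dump() line looks roughly like this (values are
// illustrative): "1234: Created root entry: 0x7f1234567890: MapEntry(0x400b2a, 0x2a)"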
286 
287 /// This hash table implementation starts by allocating a table of size
288 /// InitialSize. When conflicts happen in this main table, it resolves
289 /// them by chaining a new table of size IncSize. It never reallocs as our
290 /// allocator doesn't support it. The keys are intended to be function pointers.
291 /// There's no clever hash function (it's just x mod size, size being prime).
292 /// I never tuned the coefficients in the modular equation (TODO).
293 /// This is used for indirect calls (each call site has one of these, so it
294 /// should have a small footprint) and for tallying call counts globally for
295 /// each target to check if we missed the origin of some calls (this one is a
296 /// large instantiation of this template, since it is global for all call sites)
297 template <typename T = SimpleHashTableEntryBase, uint32_t InitialSize = 7,
298           uint32_t IncSize = 7>
299 class SimpleHashTable {
300 public:
301   using MapEntry = T;
302 
303   /// Increment by 1 the value of \p Key. If it is not in this table, it will be
304   /// added to the table and its value set to 1.
305   void incrementVal(uint64_t Key, BumpPtrAllocator &Alloc) {
306     if (!__bolt_instr_conservative) {
307       TryLock L(M);
308       if (!L.isLocked())
309         return;
310       auto &E = getOrAllocEntry(Key, Alloc);
311       ++E.Val;
312       return;
313     }
314     Lock L(M);
315     auto &E = getOrAllocEntry(Key, Alloc);
316     ++E.Val;
317   }
318 
319   /// Basic member accessing interface. Here we pass the allocator explicitly to
320   /// avoid storing a pointer to it as part of this table (remember there is one
321   /// hash for each indirect call site, so we want to minimize our footprint).
322   MapEntry &get(uint64_t Key, BumpPtrAllocator &Alloc) {
323     if (!__bolt_instr_conservative) {
324       TryLock L(M);
325       if (!L.isLocked())
326         return NoEntry;
327       return getOrAllocEntry(Key, Alloc);
328     }
329     Lock L(M);
330     return getOrAllocEntry(Key, Alloc);
331   }
332 
333   /// Traverses all elements in the table
334   template <typename... Args>
335   void forEachElement(void (*Callback)(MapEntry &, Args...), Args... args) {
336     Lock L(M);
337     if (!TableRoot)
338       return;
339     return forEachElement(Callback, InitialSize, TableRoot, args...);
340   }
341 
342   void resetCounters();
343 
344 private:
345   constexpr static uint64_t VacantMarker = 0;
346   constexpr static uint64_t FollowUpTableMarker = 0x8000000000000000ull;
347 
348   MapEntry *TableRoot{nullptr};
349   MapEntry NoEntry;
350   Mutex M;
351 
352   template <typename... Args>
353   void forEachElement(void (*Callback)(MapEntry &, Args...),
354                       uint32_t NumEntries, MapEntry *Entries, Args... args) {
355     for (uint32_t I = 0; I < NumEntries; ++I) {
356       MapEntry &Entry = Entries[I];
357       if (Entry.Key == VacantMarker)
358         continue;
359       if (Entry.Key & FollowUpTableMarker) {
360         MapEntry *Next =
361             reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker);
362         assert(Next != Entries, "Circular reference!");
363         forEachElement(Callback, IncSize, Next, args...);
364         continue;
365       }
366       Callback(Entry, args...);
367     }
368   }
369 
370   MapEntry &firstAllocation(uint64_t Key, BumpPtrAllocator &Alloc) {
371     TableRoot = new (Alloc, 0) MapEntry[InitialSize];
372     MapEntry &Entry = TableRoot[Key % InitialSize];
373     Entry.Key = Key;
374     // DEBUG(Entry.dump("Created root entry: "));
375     return Entry;
376   }
377 
378   MapEntry &getEntry(MapEntry *Entries, uint64_t Key, uint64_t Selector,
379                      BumpPtrAllocator &Alloc, int CurLevel) {
380     // DEBUG(reportNumber("getEntry called, level ", CurLevel, 10));
381     const uint32_t NumEntries = CurLevel == 0 ? InitialSize : IncSize;
382     uint64_t Remainder = Selector / NumEntries;
383     Selector = Selector % NumEntries;
384     MapEntry &Entry = Entries[Selector];
385 
386     // A hit
387     if (Entry.Key == Key) {
388       // DEBUG(Entry.dump("Hit: "));
389       return Entry;
390     }
391 
392     // Vacant - add new entry
393     if (Entry.Key == VacantMarker) {
394       Entry.Key = Key;
395       // DEBUG(Entry.dump("Adding new entry: "));
396       return Entry;
397     }
398 
399     // Defer to the next level
400     if (Entry.Key & FollowUpTableMarker) {
401       return getEntry(
402           reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker),
403           Key, Remainder, Alloc, CurLevel + 1);
404     }
405 
406     // Conflict - create the next level
407     // DEBUG(Entry.dump("Creating new level: "));
408 
409     MapEntry *NextLevelTbl = new (Alloc, 0) MapEntry[IncSize];
410     // DEBUG(
411     //     reportNumber("Newly allocated level: 0x", uint64_t(NextLevelTbl),
412     //     16));
413     uint64_t CurEntrySelector = Entry.Key / InitialSize;
414     for (int I = 0; I < CurLevel; ++I)
415       CurEntrySelector /= IncSize;
416     CurEntrySelector = CurEntrySelector % IncSize;
417     NextLevelTbl[CurEntrySelector] = Entry;
418     Entry.Key = reinterpret_cast<uint64_t>(NextLevelTbl) | FollowUpTableMarker;
419     assert((NextLevelTbl[CurEntrySelector].Key & ~FollowUpTableMarker) !=
420                uint64_t(Entries),
421            "circular reference created!\n");
422     // DEBUG(NextLevelTbl[CurEntrySelector].dump("New level entry: "));
423     // DEBUG(Entry.dump("Updated old entry: "));
424     return getEntry(NextLevelTbl, Key, Remainder, Alloc, CurLevel + 1);
425   }
426 
427   MapEntry &getOrAllocEntry(uint64_t Key, BumpPtrAllocator &Alloc) {
428     if (TableRoot) {
429       MapEntry &E = getEntry(TableRoot, Key, Key, Alloc, 0);
430       assert(!(E.Key & FollowUpTableMarker), "Invalid entry!");
431       return E;
432     }
433     return firstAllocation(Key, Alloc);
434   }
435 };
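
// Worked example of the chained lookup above, using the default sizes
// (InitialSize == IncSize == 7) and illustrative keys: key 9 lands in root
// slot 9 % 7 == 2. A later key 16 selects the same slot, so a follow-up table
// is chained: the existing entry moves to slot (9 / 7) % 7 == 1 of the new
// table, key 16 is stored in slot (16 / 7) % 7 == 2, and the root slot's Key
// becomes the follow-up table pointer tagged with FollowUpTableMarker.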
436 
437 template <typename T> void resetIndCallCounter(T &Entry) {
438   Entry.Val = 0;
439 }
440 
441 template <typename T, uint32_t X, uint32_t Y>
442 void SimpleHashTable<T, X, Y>::resetCounters() {
443   forEachElement(resetIndCallCounter);
444 }
445 
446 /// Represents a hash table mapping a function target address to its counter.
447 using IndirectCallHashTable = SimpleHashTable<>;
448 
449 /// Initialize with number 1 instead of 0 so we don't go into .bss. This is the
450 /// global array of all hash tables storing indirect call destinations observed
451 /// at runtime, one table per call site.
452 IndirectCallHashTable *GlobalIndCallCounters{
453     reinterpret_cast<IndirectCallHashTable *>(1)};
454 
455 /// Don't allow reentrancy in the fdata writing phase - only one thread writes
456 /// it
457 Mutex *GlobalWriteProfileMutex{reinterpret_cast<Mutex *>(1)};
458 
459 /// Store the number of calls in addition to the target address (Key) and the
460 /// frequency as perceived by the basic block counter (Val).
461 struct CallFlowEntryBase : public SimpleHashTableEntryBase {
462   uint64_t Calls;
463 };
464 
465 using CallFlowHashTableBase = SimpleHashTable<CallFlowEntryBase, 11939, 233>;
466 
467 /// This is a large table indexing all possible call targets (indirect and
468 /// direct ones). The goal is to find mismatches between number of calls (for
469 /// those calls we were able to track) and the entry basic block counter of the
470 /// callee. In most cases, these two should be equal. If not, there are two
471 /// possible scenarios here:
472 ///
473 ///  * Entry BB has higher frequency than all known calls to this function.
474 ///    In this case, we have dynamic library code or any uninstrumented code
475 ///    calling this function. We will write the profile for these untracked
476 ///    calls as having source "0 [unknown] 0" in the fdata file.
477 ///
478 ///  * Number of known calls is higher than the frequency of entry BB
479 ///    This only happens when there is no counter for the entry BB / callee
480 ///    function is not simple (in BOLT terms). We don't do anything special
481 ///    here and just ignore those (we still report all calls to the non-simple
482 ///    function, though).
483 ///
484 class CallFlowHashTable : public CallFlowHashTableBase {
485 public:
486   CallFlowHashTable(BumpPtrAllocator &Alloc) : Alloc(Alloc) {}
487 
488   MapEntry &get(uint64_t Key) { return CallFlowHashTableBase::get(Key, Alloc); }
489 
490 private:
491   // Unlike the hash table for indirect call targets, we do store the
492   // allocator here since there is only one call flow hash and space overhead
493   // is negligible.
494   BumpPtrAllocator &Alloc;
495 };
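
// Illustrative numbers for the two scenarios above: if a callee's entry block
// frequency (Val) is 100 but only 60 calls to it were tracked (Calls),
// visitCallFlowEntry() below emits the 40 missing calls with source
// "0 [unknown] 0". If instead 100 calls were tracked against an entry
// frequency of 60, nothing extra is emitted for this callee.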
496 
497 ///
498 /// Description metadata emitted by BOLT to describe the program - refer to
499 /// Passes/Instrumentation.cpp - Instrumentation::emitTablesAsELFNote()
500 ///
501 struct Location {
502   uint32_t FunctionName;
503   uint32_t Offset;
504 };
505 
506 struct CallDescription {
507   Location From;
508   uint32_t FromNode;
509   Location To;
510   uint32_t Counter;
511   uint64_t TargetAddress;
512 };
513 
514 using IndCallDescription = Location;
515 
516 struct IndCallTargetDescription {
517   Location Loc;
518   uint64_t Address;
519 };
520 
521 struct EdgeDescription {
522   Location From;
523   uint32_t FromNode;
524   Location To;
525   uint32_t ToNode;
526   uint32_t Counter;
527 };
528 
529 struct InstrumentedNode {
530   uint32_t Node;
531   uint32_t Counter;
532 };
533 
534 struct EntryNode {
535   uint64_t Node;
536   uint64_t Address;
537 };
538 
539 struct FunctionDescription {
540   uint32_t NumLeafNodes;
541   const InstrumentedNode *LeafNodes;
542   uint32_t NumEdges;
543   const EdgeDescription *Edges;
544   uint32_t NumCalls;
545   const CallDescription *Calls;
546   uint32_t NumEntryNodes;
547   const EntryNode *EntryNodes;
548 
549   /// Constructor will parse the serialized function metadata written by BOLT
550   FunctionDescription(const uint8_t *FuncDesc);
551 
552   uint64_t getSize() const {
553     return 16 + NumLeafNodes * sizeof(InstrumentedNode) +
554            NumEdges * sizeof(EdgeDescription) +
555            NumCalls * sizeof(CallDescription) +
556            NumEntryNodes * sizeof(EntryNode);
557   }
558 };
559 
560 /// The context is created when the fdata profile needs to be written to disk
561 /// and we need to interpret our runtime counters. It contains pointers to the
562 /// mmapped binary (only the BOLT-written metadata section). Deserialization
563 /// should be straightforward as most data is POD or an array of POD elements.
564 /// This metadata is used to reconstruct function CFGs.
565 struct ProfileWriterContext {
566   IndCallDescription *IndCallDescriptions;
567   IndCallTargetDescription *IndCallTargets;
568   uint8_t *FuncDescriptions;
569   char *Strings;  // String table with function names used in this binary
570   int FileDesc;   // File descriptor for the file on disk backing this
571                   // information in memory via mmap
572   void *MMapPtr;  // The mmap ptr
573   int MMapSize;   // The mmap size
574 
575   /// Hash table storing all possible call destinations to detect untracked
576   /// calls and correctly report them as [unknown] in output fdata.
577   CallFlowHashTable *CallFlowTable;
578 
579   /// Lookup the sorted indirect call target vector to fetch function name and
580   /// offset for an arbitrary function pointer.
581   const IndCallTargetDescription *lookupIndCallTarget(uint64_t Target) const;
582 };
583 
584 /// Performs a string comparison and returns zero if Str1 matches Str2. Compares
585 /// at most Size characters.
586 int compareStr(const char *Str1, const char *Str2, int Size) {
587   while (*Str1 == *Str2) {
588     if (*Str1 == '\0' || --Size == 0)
589       return 0;
590     ++Str1;
591     ++Str2;
592   }
593   return 1;
594 }
595 
596 /// Output Location to the fdata file
597 char *serializeLoc(const ProfileWriterContext &Ctx, char *OutBuf,
598                    const Location Loc, uint32_t BufSize) {
599   // fdata location format: Type Name Offset
600   // Type 1 - regular symbol
601   OutBuf = strCopy(OutBuf, "1 ");
602   const char *Str = Ctx.Strings + Loc.FunctionName;
603   uint32_t Size = 25;
604   while (*Str) {
605     *OutBuf++ = *Str++;
606     if (++Size >= BufSize)
607       break;
608   }
609   assert(!*Str, "buffer overflow, function name too large");
610   *OutBuf++ = ' ';
611   OutBuf = intToStr(OutBuf, Loc.Offset, 16);
612   *OutBuf++ = ' ';
613   return OutBuf;
614 }
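
// For illustration, the edge lines emitted later by writeFunctionProfile()
// using serializeLoc() look like (names and numbers are made up):
//   1 foo 1e 1 bar 0 0 4
// i.e. "<type> <from> <offset> <type> <to> <offset> <mispreds> <count>", with
// offsets printed in hex and type 1 denoting a regular symbol.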
615 
616 /// Read and deserialize a function description written by BOLT. \p FuncDesc
617 /// points at the beginning of the function metadata structure in the file.
618 /// See Instrumentation::emitTablesAsELFNote()
619 FunctionDescription::FunctionDescription(const uint8_t *FuncDesc) {
620   NumLeafNodes = *reinterpret_cast<const uint32_t *>(FuncDesc);
621   DEBUG(reportNumber("NumLeafNodes = ", NumLeafNodes, 10));
622   LeafNodes = reinterpret_cast<const InstrumentedNode *>(FuncDesc + 4);
623 
624   NumEdges = *reinterpret_cast<const uint32_t *>(
625       FuncDesc + 4 + NumLeafNodes * sizeof(InstrumentedNode));
626   DEBUG(reportNumber("NumEdges = ", NumEdges, 10));
627   Edges = reinterpret_cast<const EdgeDescription *>(
628       FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode));
629 
630   NumCalls = *reinterpret_cast<const uint32_t *>(
631       FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode) +
632       NumEdges * sizeof(EdgeDescription));
633   DEBUG(reportNumber("NumCalls = ", NumCalls, 10));
634   Calls = reinterpret_cast<const CallDescription *>(
635       FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) +
636       NumEdges * sizeof(EdgeDescription));
637   NumEntryNodes = *reinterpret_cast<const uint32_t *>(
638       FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) +
639       NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription));
640   DEBUG(reportNumber("NumEntryNodes = ", NumEntryNodes, 10));
641   EntryNodes = reinterpret_cast<const EntryNode *>(
642       FuncDesc + 16 + NumLeafNodes * sizeof(InstrumentedNode) +
643       NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription));
644 }
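
// To summarize the parsing above, a serialized function description is laid
// out as:
//   [NumLeafNodes:u32][NumLeafNodes x InstrumentedNode]
//   [NumEdges:u32][NumEdges x EdgeDescription]
//   [NumCalls:u32][NumCalls x CallDescription]
//   [NumEntryNodes:u32][NumEntryNodes x EntryNode]
// which is also why getSize() adds 16 bytes for the four 32-bit count fields.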
645 
646 /// Read and mmap descriptions written by BOLT from the executable's notes
647 /// section
648 #if defined(HAVE_ELF_H) and !defined(__APPLE__)
649 
650 void *__attribute__((noinline)) __get_pc() {
651   return __builtin_extract_return_addr(__builtin_return_address(0));
652 }
653 
654 /// Parse an address-range string into the pair <StartAddress, EndAddress>
655 bool parseAddressRange(const char *Str, uint64_t &StartAddress,
656                        uint64_t &EndAddress) {
657   if (!Str)
658     return false;
659   // Parsed string format: <hex1>-<hex2>
660   StartAddress = hexToLong(Str, '-');
661   while (*Str && *Str != '-')
662     ++Str;
663   if (!*Str)
664     return false;
665   ++Str; // swallow '-'
666   EndAddress = hexToLong(Str);
667   return true;
668 }
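
// For example, a map_files entry named "55d1a83b6000-55d1a85f0000"
// (illustrative addresses) parses into StartAddress = 0x55d1a83b6000 and
// EndAddress = 0x55d1a85f0000.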
669 
670 /// Get the full path to the real binary by taking the current virtual address
671 /// and searching /proc/self/map_files for the link whose address range
672 /// contains it.
673 static char *getBinaryPath() {
674   const uint32_t BufSize = 1024;
675   const uint32_t NameMax = 4096;
676   const char DirPath[] = "/proc/self/map_files/";
677   static char TargetPath[NameMax] = {};
678   char Buf[BufSize];
679 
680   if (__bolt_instr_binpath[0] != '\0')
681     return __bolt_instr_binpath;
682 
683   if (TargetPath[0] != '\0')
684     return TargetPath;
685 
686   unsigned long CurAddr = (unsigned long)__get_pc();
687   uint64_t FDdir = __open(DirPath, O_RDONLY,
688                           /*mode=*/0666);
689   assert(static_cast<int64_t>(FDdir) >= 0,
690          "failed to open /proc/self/map_files");
691 
692   while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) {
693     assert(static_cast<int64_t>(Nread) != -1, "failed to get folder entries");
694 
695     struct dirent *d;
696     for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) {
697       d = (struct dirent *)(Buf + Bpos);
698 
699       uint64_t StartAddress, EndAddress;
700       if (!parseAddressRange(d->d_name, StartAddress, EndAddress))
701         continue;
702       if (CurAddr < StartAddress || CurAddr > EndAddress)
703         continue;
704       char FindBuf[NameMax];
705       char *C = strCopy(FindBuf, DirPath, NameMax);
706       C = strCopy(C, d->d_name, NameMax - (C - FindBuf));
707       *C = '\0';
708       uint32_t Ret = __readlink(FindBuf, TargetPath, sizeof(TargetPath));
709       assert(Ret != -1 && Ret != BufSize, "readlink error");
710       TargetPath[Ret] = '\0';
711       return TargetPath;
712     }
713   }
714   return nullptr;
715 }
716 
717 ProfileWriterContext readDescriptions() {
718   ProfileWriterContext Result;
719   char *BinPath = getBinaryPath();
720   assert(BinPath && BinPath[0] != '\0', "failed to find binary path");
721 
722   uint64_t FD = __open(BinPath, O_RDONLY,
723                        /*mode=*/0666);
724   assert(static_cast<int64_t>(FD) >= 0, "failed to open binary path");
725 
726   Result.FileDesc = FD;
727 
728   // mmap our binary to memory
729   uint64_t Size = __lseek(FD, 0, SEEK_END);
730   uint8_t *BinContents = reinterpret_cast<uint8_t *>(
731       __mmap(0, Size, PROT_READ, MAP_PRIVATE, FD, 0));
732   assert(BinContents != MAP_FAILED, "readDescriptions: Failed to mmap self!");
733   Result.MMapPtr = BinContents;
734   Result.MMapSize = Size;
735   Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
736   Elf64_Shdr *Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff);
737   Elf64_Shdr *StringTblHeader = reinterpret_cast<Elf64_Shdr *>(
738       BinContents + Hdr->e_shoff + Hdr->e_shstrndx * Hdr->e_shentsize);
739 
740   // Find .bolt.instr.tables with the data we need and set pointers to it
741   for (int I = 0; I < Hdr->e_shnum; ++I) {
742     char *SecName = reinterpret_cast<char *>(
743         BinContents + StringTblHeader->sh_offset + Shdr->sh_name);
744     if (compareStr(SecName, ".bolt.instr.tables", 64) != 0) {
745       Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff +
746                                             (I + 1) * Hdr->e_shentsize);
747       continue;
748     }
749     // Actual contents of the ELF note start after offset 20 decimal:
750     // Offset 0: Producer name size (4 bytes)
751     // Offset 4: Contents size (4 bytes)
752     // Offset 8: Note type (4 bytes)
753     // Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
754     // Offset 20: Contents
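    // The contents themselves (as parsed below) are three size-prefixed tables
    // followed by the string table:
    //   [IndCallDescSize:u32][IndCallDescriptions]
    //   [IndCallTargetDescSize:u32][IndCallTargetDescriptions]
    //   [FuncDescSize:u32][FunctionDescriptions]
    //   [Strings: function names referenced by Location entries]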
755     uint32_t IndCallDescSize =
756         *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 20);
757     uint32_t IndCallTargetDescSize = *reinterpret_cast<uint32_t *>(
758         BinContents + Shdr->sh_offset + 24 + IndCallDescSize);
759     uint32_t FuncDescSize =
760         *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 28 +
761                                       IndCallDescSize + IndCallTargetDescSize);
762     Result.IndCallDescriptions = reinterpret_cast<IndCallDescription *>(
763         BinContents + Shdr->sh_offset + 24);
764     Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
765         BinContents + Shdr->sh_offset + 28 + IndCallDescSize);
766     Result.FuncDescriptions = BinContents + Shdr->sh_offset + 32 +
767                               IndCallDescSize + IndCallTargetDescSize;
768     Result.Strings = reinterpret_cast<char *>(
769         BinContents + Shdr->sh_offset + 32 + IndCallDescSize +
770         IndCallTargetDescSize + FuncDescSize);
771     return Result;
772   }
773   const char ErrMsg[] =
774       "BOLT instrumentation runtime error: could not find section "
775       ".bolt.instr.tables\n";
776   reportError(ErrMsg, sizeof(ErrMsg));
777   return Result;
778 }
779 
780 #else
781 
782 ProfileWriterContext readDescriptions() {
783   ProfileWriterContext Result;
784   uint8_t *Tables = _bolt_instr_tables_getter();
785   uint32_t IndCallDescSize = *reinterpret_cast<uint32_t *>(Tables);
786   uint32_t IndCallTargetDescSize =
787       *reinterpret_cast<uint32_t *>(Tables + 4 + IndCallDescSize);
788   uint32_t FuncDescSize = *reinterpret_cast<uint32_t *>(
789       Tables + 8 + IndCallDescSize + IndCallTargetDescSize);
790   Result.IndCallDescriptions =
791       reinterpret_cast<IndCallDescription *>(Tables + 4);
792   Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
793       Tables + 8 + IndCallDescSize);
794   Result.FuncDescriptions =
795       Tables + 12 + IndCallDescSize + IndCallTargetDescSize;
796   Result.Strings = reinterpret_cast<char *>(
797       Tables + 12 + IndCallDescSize + IndCallTargetDescSize + FuncDescSize);
798   return Result;
799 }
800 
801 #endif
802 
803 #if !defined(__APPLE__)
804 /// Debug helper: print overall metadata global numbers to check they are sane
805 void printStats(const ProfileWriterContext &Ctx) {
806   char StatMsg[BufSize];
807   char *StatPtr = StatMsg;
808   StatPtr =
809       strCopy(StatPtr,
810               "\nBOLT INSTRUMENTATION RUNTIME STATISTICS\n\nIndCallDescSize: ");
811   StatPtr = intToStr(StatPtr,
812                      Ctx.FuncDescriptions -
813                          reinterpret_cast<uint8_t *>(Ctx.IndCallDescriptions),
814                      10);
815   StatPtr = strCopy(StatPtr, "\nFuncDescSize: ");
816   StatPtr = intToStr(
817       StatPtr,
818       reinterpret_cast<uint8_t *>(Ctx.Strings) - Ctx.FuncDescriptions, 10);
819   StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_ind_calls: ");
820   StatPtr = intToStr(StatPtr, __bolt_instr_num_ind_calls, 10);
821   StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_funcs: ");
822   StatPtr = intToStr(StatPtr, __bolt_instr_num_funcs, 10);
823   StatPtr = strCopy(StatPtr, "\n");
824   __write(2, StatMsg, StatPtr - StatMsg);
825 }
826 #endif
827 
828 
829 /// This is part of a simple CFG representation in memory, where we store
830 /// a dynamically sized array of input and output edges per node, and store
831 /// a dynamically sized array of nodes per graph. We also store the spanning
832 /// tree edges for that CFG in a separate array of nodes in
833 /// \p SpanningTreeNodes, while the regular nodes live in \p CFGNodes.
834 struct Edge {
835   uint32_t Node; // Index in the nodes array of this edge's destination
836   uint32_t ID;   // Edge index in an array comprising all edges of the graph
837 };
838 
839 /// A regular graph node or a spanning tree node
840 struct Node {
841   uint32_t NumInEdges{0};  // Input edge count used to size InEdge
842   uint32_t NumOutEdges{0}; // Output edge count used to size OutEdges
843   Edge *InEdges{nullptr};  // Created and managed by \p Graph
844   Edge *OutEdges{nullptr}; // ditto
845 };
846 
847 /// Main class for CFG representation in memory. Manages object creation and
848 /// destruction, populates an array of CFG nodes as well as corresponding
849 /// spanning tree nodes.
850 struct Graph {
851   uint32_t NumNodes;
852   Node *CFGNodes;
853   Node *SpanningTreeNodes;
854   uint64_t *EdgeFreqs;
855   uint64_t *CallFreqs;
856   BumpPtrAllocator &Alloc;
857   const FunctionDescription &D;
858 
859   /// Reads a list of edges from function description \p D and builds
860   /// the graph from it. Allocates several internal dynamic structures that are
861   /// later destroyed by ~Graph() and uses \p Alloc. D.LeafNodes contains the
862   /// descriptions of all spanning tree leaf nodes (their counters). They are
863   /// the seed used to compute the rest of the missing edge counts in a
864   /// bottom-up traversal of the spanning tree.
865   Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
866         const uint64_t *Counters, ProfileWriterContext &Ctx);
867   ~Graph();
868   void dump() const;
869 
870 private:
871   void computeEdgeFrequencies(const uint64_t *Counters,
872                               ProfileWriterContext &Ctx);
873   void dumpEdgeFreqs() const;
874 };
875 
876 Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
877              const uint64_t *Counters, ProfileWriterContext &Ctx)
878     : Alloc(Alloc), D(D) {
879   DEBUG(reportNumber("G = 0x", (uint64_t)this, 16));
880   // First pass to determine number of nodes
881   int32_t MaxNodes = -1;
882   CallFreqs = nullptr;
883   EdgeFreqs = nullptr;
884   for (int I = 0; I < D.NumEdges; ++I) {
885     if (static_cast<int32_t>(D.Edges[I].FromNode) > MaxNodes)
886       MaxNodes = D.Edges[I].FromNode;
887     if (static_cast<int32_t>(D.Edges[I].ToNode) > MaxNodes)
888       MaxNodes = D.Edges[I].ToNode;
889   }
890 
891   for (int I = 0; I < D.NumLeafNodes; ++I)
892     if (static_cast<int32_t>(D.LeafNodes[I].Node) > MaxNodes)
893       MaxNodes = D.LeafNodes[I].Node;
894 
895   for (int I = 0; I < D.NumCalls; ++I)
896     if (static_cast<int32_t>(D.Calls[I].FromNode) > MaxNodes)
897       MaxNodes = D.Calls[I].FromNode;
898 
899   // No nodes? Nothing to do
900   if (MaxNodes < 0) {
901     DEBUG(report("No nodes!\n"));
902     CFGNodes = nullptr;
903     SpanningTreeNodes = nullptr;
904     NumNodes = 0;
905     return;
906   }
907   ++MaxNodes;
908   DEBUG(reportNumber("NumNodes = ", MaxNodes, 10));
909   NumNodes = static_cast<uint32_t>(MaxNodes);
910 
911   // Initial allocations
912   CFGNodes = new (Alloc) Node[MaxNodes];
913 
914   DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16));
915   SpanningTreeNodes = new (Alloc) Node[MaxNodes];
916   DEBUG(reportNumber("G->SpanningTreeNodes = 0x",
917                      (uint64_t)SpanningTreeNodes, 16));
918 
919   // Figure out how much to allocate to each vector (in/out edge sets)
920   for (int I = 0; I < D.NumEdges; ++I) {
921     CFGNodes[D.Edges[I].FromNode].NumOutEdges++;
922     CFGNodes[D.Edges[I].ToNode].NumInEdges++;
923     if (D.Edges[I].Counter != 0xffffffff)
924       continue;
925 
926     SpanningTreeNodes[D.Edges[I].FromNode].NumOutEdges++;
927     SpanningTreeNodes[D.Edges[I].ToNode].NumInEdges++;
928   }
929 
930   // Allocate in/out edge sets
931   for (int I = 0; I < MaxNodes; ++I) {
932     if (CFGNodes[I].NumInEdges > 0)
933       CFGNodes[I].InEdges = new (Alloc) Edge[CFGNodes[I].NumInEdges];
934     if (CFGNodes[I].NumOutEdges > 0)
935       CFGNodes[I].OutEdges = new (Alloc) Edge[CFGNodes[I].NumOutEdges];
936     if (SpanningTreeNodes[I].NumInEdges > 0)
937       SpanningTreeNodes[I].InEdges =
938           new (Alloc) Edge[SpanningTreeNodes[I].NumInEdges];
939     if (SpanningTreeNodes[I].NumOutEdges > 0)
940       SpanningTreeNodes[I].OutEdges =
941           new (Alloc) Edge[SpanningTreeNodes[I].NumOutEdges];
942     CFGNodes[I].NumInEdges = 0;
943     CFGNodes[I].NumOutEdges = 0;
944     SpanningTreeNodes[I].NumInEdges = 0;
945     SpanningTreeNodes[I].NumOutEdges = 0;
946   }
947 
948   // Fill in/out edge sets
949   for (int I = 0; I < D.NumEdges; ++I) {
950     const uint32_t Src = D.Edges[I].FromNode;
951     const uint32_t Dst = D.Edges[I].ToNode;
952     Edge *E = &CFGNodes[Src].OutEdges[CFGNodes[Src].NumOutEdges++];
953     E->Node = Dst;
954     E->ID = I;
955 
956     E = &CFGNodes[Dst].InEdges[CFGNodes[Dst].NumInEdges++];
957     E->Node = Src;
958     E->ID = I;
959 
960     if (D.Edges[I].Counter != 0xffffffff)
961       continue;
962 
963     E = &SpanningTreeNodes[Src]
964              .OutEdges[SpanningTreeNodes[Src].NumOutEdges++];
965     E->Node = Dst;
966     E->ID = I;
967 
968     E = &SpanningTreeNodes[Dst]
969              .InEdges[SpanningTreeNodes[Dst].NumInEdges++];
970     E->Node = Src;
971     E->ID = I;
972   }
973 
974   computeEdgeFrequencies(Counters, Ctx);
975 }
976 
977 Graph::~Graph() {
978   if (CallFreqs)
979     Alloc.deallocate(CallFreqs);
980   if (EdgeFreqs)
981     Alloc.deallocate(EdgeFreqs);
982   for (int I = NumNodes - 1; I >= 0; --I) {
983     if (SpanningTreeNodes[I].OutEdges)
984       Alloc.deallocate(SpanningTreeNodes[I].OutEdges);
985     if (SpanningTreeNodes[I].InEdges)
986       Alloc.deallocate(SpanningTreeNodes[I].InEdges);
987     if (CFGNodes[I].OutEdges)
988       Alloc.deallocate(CFGNodes[I].OutEdges);
989     if (CFGNodes[I].InEdges)
990       Alloc.deallocate(CFGNodes[I].InEdges);
991   }
992   if (SpanningTreeNodes)
993     Alloc.deallocate(SpanningTreeNodes);
994   if (CFGNodes)
995     Alloc.deallocate(CFGNodes);
996 }
997 
998 void Graph::dump() const {
999   reportNumber("Dumping graph with number of nodes: ", NumNodes, 10);
1000   report("  Full graph:\n");
1001   for (int I = 0; I < NumNodes; ++I) {
1002     const Node *N = &CFGNodes[I];
1003     reportNumber("    Node #", I, 10);
1004     reportNumber("      InEdges total ", N->NumInEdges, 10);
1005     for (int J = 0; J < N->NumInEdges; ++J)
1006       reportNumber("        ", N->InEdges[J].Node, 10);
1007     reportNumber("      OutEdges total ", N->NumOutEdges, 10);
1008     for (int J = 0; J < N->NumOutEdges; ++J)
1009       reportNumber("        ", N->OutEdges[J].Node, 10);
1010     report("\n");
1011   }
1012   report("  Spanning tree:\n");
1013   for (int I = 0; I < NumNodes; ++I) {
1014     const Node *N = &SpanningTreeNodes[I];
1015     reportNumber("    Node #", I, 10);
1016     reportNumber("      InEdges total ", N->NumInEdges, 10);
1017     for (int J = 0; J < N->NumInEdges; ++J)
1018       reportNumber("        ", N->InEdges[J].Node, 10);
1019     reportNumber("      OutEdges total ", N->NumOutEdges, 10);
1020     for (int J = 0; J < N->NumOutEdges; ++J)
1021       reportNumber("        ", N->OutEdges[J].Node, 10);
1022     report("\n");
1023   }
1024 }
1025 
1026 void Graph::dumpEdgeFreqs() const {
1027   reportNumber(
1028       "Dumping edge frequencies for graph with num edges: ", D.NumEdges, 10);
1029   for (int I = 0; I < D.NumEdges; ++I) {
1030     reportNumber("* Src: ", D.Edges[I].FromNode, 10);
1031     reportNumber("  Dst: ", D.Edges[I].ToNode, 10);
1032     reportNumber("    Cnt: ", EdgeFreqs[I], 10);
1033   }
1034 }
1035 
1036 /// Auxiliary map structure for fast lookups of which calls map to each node of
1037 /// the function CFG
1038 struct NodeToCallsMap {
1039   struct MapEntry {
1040     uint32_t NumCalls;
1041     uint32_t *Calls;
1042   };
1043   MapEntry *Entries;
1044   BumpPtrAllocator &Alloc;
1045   const uint32_t NumNodes;
1046 
1047   NodeToCallsMap(BumpPtrAllocator &Alloc, const FunctionDescription &D,
1048                  uint32_t NumNodes)
1049       : Alloc(Alloc), NumNodes(NumNodes) {
1050     Entries = new (Alloc, 0) MapEntry[NumNodes];
1051     for (int I = 0; I < D.NumCalls; ++I) {
1052       DEBUG(reportNumber("Registering call in node ", D.Calls[I].FromNode, 10));
1053       ++Entries[D.Calls[I].FromNode].NumCalls;
1054     }
1055     for (int I = 0; I < NumNodes; ++I) {
1056       Entries[I].Calls = Entries[I].NumCalls ? new (Alloc)
1057                                                    uint32_t[Entries[I].NumCalls]
1058                                              : nullptr;
1059       Entries[I].NumCalls = 0;
1060     }
1061     for (int I = 0; I < D.NumCalls; ++I) {
1062       MapEntry &Entry = Entries[D.Calls[I].FromNode];
1063       Entry.Calls[Entry.NumCalls++] = I;
1064     }
1065   }
1066 
1067   /// Set the frequency of all calls in node \p NodeID to Freq. However, if
1068   /// the calls have their own counters and do not depend on the basic block
1069   /// counter, this means they have landing pads and throw exceptions. In this
1070   /// case, set their frequency with their counters and return the maximum
1071   /// value observed in such counters. This will be used as the new frequency
1072   /// at basic block entry. This is used to fix the CFG edge frequencies in the
1073   /// presence of exceptions.
1074   uint64_t visitAllCallsIn(uint32_t NodeID, uint64_t Freq, uint64_t *CallFreqs,
1075                            const FunctionDescription &D,
1076                            const uint64_t *Counters,
1077                            ProfileWriterContext &Ctx) const {
1078     const MapEntry &Entry = Entries[NodeID];
1079     uint64_t MaxValue = 0ull;
1080     for (int I = 0, E = Entry.NumCalls; I != E; ++I) {
1081       const uint32_t CallID = Entry.Calls[I];
1082       DEBUG(reportNumber("  Setting freq for call ID: ", CallID, 10));
1083       const CallDescription &CallDesc = D.Calls[CallID];
1084       if (CallDesc.Counter == 0xffffffff) {
1085         CallFreqs[CallID] = Freq;
1086         DEBUG(reportNumber("  with : ", Freq, 10));
1087       } else {
1088         const uint64_t CounterVal = Counters[CallDesc.Counter];
1089         CallFreqs[CallID] = CounterVal;
1090         MaxValue = CounterVal > MaxValue ? CounterVal : MaxValue;
1091         DEBUG(reportNumber("  with (private counter) : ", CounterVal, 10));
1092       }
1093       DEBUG(reportNumber("  Address: 0x", CallDesc.TargetAddress, 16));
1094       if (CallFreqs[CallID] > 0)
1095         Ctx.CallFlowTable->get(CallDesc.TargetAddress).Calls +=
1096             CallFreqs[CallID];
1097     }
1098     return MaxValue;
1099   }
1100 
1101   ~NodeToCallsMap() {
1102     for (int I = NumNodes - 1; I >= 0; --I)
1103       if (Entries[I].Calls)
1104         Alloc.deallocate(Entries[I].Calls);
1105     Alloc.deallocate(Entries);
1106   }
1107 };
1108 
1109 /// Fill an array with the frequency of each edge in the function represented
1110 /// by G, as well as another array for each call.
1111 void Graph::computeEdgeFrequencies(const uint64_t *Counters,
1112                                    ProfileWriterContext &Ctx) {
1113   if (NumNodes == 0)
1114     return;
1115 
1116   EdgeFreqs = D.NumEdges ? new (Alloc, 0) uint64_t [D.NumEdges] : nullptr;
1117   CallFreqs = D.NumCalls ? new (Alloc, 0) uint64_t [D.NumCalls] : nullptr;
1118 
1119   // Setup a lookup for calls present in each node (BB)
1120   NodeToCallsMap *CallMap = new (Alloc) NodeToCallsMap(Alloc, D, NumNodes);
1121 
1122   // Perform a bottom-up traversal of the spanning tree in G. Edges in the
1123   // spanning tree don't have explicit counters. We must infer their value using
1124   // a linear combination of other counters (sum of counters of the outgoing
1125   // edges minus sum of counters of the incoming edges).
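  // As a small illustrative example: if a node's outgoing CFG edges have known
  // counts 5 and 3, its inferred frequency is 8; if it also has one non-tree
  // incoming edge with count 2, the spanning-tree edge from its parent is
  // assigned 8 - 2 = 6.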
1126   uint32_t *Stack = new (Alloc) uint32_t [NumNodes];
1127   uint32_t StackTop = 0;
1128   enum Status : uint8_t { S_NEW = 0, S_VISITING, S_VISITED };
1129   Status *Visited = new (Alloc, 0) Status[NumNodes];
1130   uint64_t *LeafFrequency = new (Alloc, 0) uint64_t[NumNodes];
1131   uint64_t *EntryAddress = new (Alloc, 0) uint64_t[NumNodes];
1132 
1133   // Setup a fast lookup for frequency of leaf nodes, which have special
1134   // basic block frequency instrumentation (they are not edge profiled).
1135   for (int I = 0; I < D.NumLeafNodes; ++I) {
1136     LeafFrequency[D.LeafNodes[I].Node] = Counters[D.LeafNodes[I].Counter];
1137     DEBUG({
1138       if (Counters[D.LeafNodes[I].Counter] > 0) {
1139         reportNumber("Leaf Node# ", D.LeafNodes[I].Node, 10);
1140         reportNumber("     Counter: ", Counters[D.LeafNodes[I].Counter], 10);
1141       }
1142     });
1143   }
1144   for (int I = 0; I < D.NumEntryNodes; ++I) {
1145     EntryAddress[D.EntryNodes[I].Node] = D.EntryNodes[I].Address;
1146     DEBUG({
1147         reportNumber("Entry Node# ", D.EntryNodes[I].Node, 10);
1148         reportNumber("      Address: ", D.EntryNodes[I].Address, 16);
1149     });
1150   }
1151   // Add all root nodes to the stack
1152   for (int I = 0; I < NumNodes; ++I)
1153     if (SpanningTreeNodes[I].NumInEdges == 0)
1154       Stack[StackTop++] = I;
1155 
1156   // Empty stack?
1157   if (StackTop == 0) {
1158     DEBUG(report("Empty stack!\n"));
1159     Alloc.deallocate(EntryAddress);
1160     Alloc.deallocate(LeafFrequency);
1161     Alloc.deallocate(Visited);
1162     Alloc.deallocate(Stack);
1163     CallMap->~NodeToCallsMap();
1164     Alloc.deallocate(CallMap);
1165     if (CallFreqs)
1166       Alloc.deallocate(CallFreqs);
1167     if (EdgeFreqs)
1168       Alloc.deallocate(EdgeFreqs);
1169     EdgeFreqs = nullptr;
1170     CallFreqs = nullptr;
1171     return;
1172   }
1173   // Add all known edge counts, will infer the rest
1174   for (int I = 0; I < D.NumEdges; ++I) {
1175     const uint32_t C = D.Edges[I].Counter;
1176     if (C == 0xffffffff) // inferred counter - we will compute its value
1177       continue;
1178     EdgeFreqs[I] = Counters[C];
1179   }
1180 
1181   while (StackTop > 0) {
1182     const uint32_t Cur = Stack[--StackTop];
1183     DEBUG({
1184       if (Visited[Cur] == S_VISITING)
1185         report("(visiting) ");
1186       else
1187         report("(new) ");
1188       reportNumber("Cur: ", Cur, 10);
1189     });
1190 
1191     // This shouldn't happen in a tree
1192     assert(Visited[Cur] != S_VISITED, "should not have visited nodes in stack");
1193     if (Visited[Cur] == S_NEW) {
1194       Visited[Cur] = S_VISITING;
1195       Stack[StackTop++] = Cur;
1196       assert(StackTop <= NumNodes, "stack grew too large");
1197       for (int I = 0, E = SpanningTreeNodes[Cur].NumOutEdges; I < E; ++I) {
1198         const uint32_t Succ = SpanningTreeNodes[Cur].OutEdges[I].Node;
1199         Stack[StackTop++] = Succ;
1200         assert(StackTop <= NumNodes, "stack grew too large");
1201       }
1202       continue;
1203     }
1204     Visited[Cur] = S_VISITED;
1205 
1206     // Establish our node frequency based on outgoing edges, which should all be
1207     // resolved by now.
1208     int64_t CurNodeFreq = LeafFrequency[Cur];
1209     // Not a leaf?
1210     if (!CurNodeFreq) {
1211       for (int I = 0, E = CFGNodes[Cur].NumOutEdges; I != E; ++I) {
1212         const uint32_t SuccEdge = CFGNodes[Cur].OutEdges[I].ID;
1213         CurNodeFreq += EdgeFreqs[SuccEdge];
1214       }
1215     }
1216     if (CurNodeFreq < 0)
1217       CurNodeFreq = 0;
1218 
1219     const uint64_t CallFreq = CallMap->visitAllCallsIn(
1220         Cur, CurNodeFreq > 0 ? CurNodeFreq : 0, CallFreqs, D, Counters, Ctx);
1221 
1222     // Exception handling affected our output flow? Fix with calls info
1223     DEBUG({
1224       if (CallFreq > CurNodeFreq)
1225         report("Bumping node frequency with call info\n");
1226     });
1227     CurNodeFreq = CallFreq > CurNodeFreq ? CallFreq : CurNodeFreq;
1228 
1229     if (CurNodeFreq > 0) {
1230       if (uint64_t Addr = EntryAddress[Cur]) {
1231         DEBUG(
1232             reportNumber("  Setting flow at entry point address 0x", Addr, 16));
1233         DEBUG(reportNumber("  with: ", CurNodeFreq, 10));
1234         Ctx.CallFlowTable->get(Addr).Val = CurNodeFreq;
1235       }
1236     }
1237 
1238     // No parent? Reached a tree root, limit to call frequency updating.
1239     if (SpanningTreeNodes[Cur].NumInEdges == 0)
1240       continue;
1241 
1242     assert(SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent");
1243     const uint32_t Parent = SpanningTreeNodes[Cur].InEdges[0].Node;
1244     const uint32_t ParentEdge = SpanningTreeNodes[Cur].InEdges[0].ID;
1245 
1246     // Calculate parent edge freq.
1247     int64_t ParentEdgeFreq = CurNodeFreq;
1248     for (int I = 0, E = CFGNodes[Cur].NumInEdges; I != E; ++I) {
1249       const uint32_t PredEdge = CFGNodes[Cur].InEdges[I].ID;
1250       ParentEdgeFreq -= EdgeFreqs[PredEdge];
1251     }
1252 
1253     // Sometimes the conservative CFG that BOLT builds will lead to incorrect
1254     // flow computation. For example, in a BB that transitively calls the exit
1255     // syscall, BOLT will add a fall-through successor even though it should not
1256     // have any successors. So the execution count computed for this block will
1257     // likely be wrong. We tolerate this imperfection since such cases are rare.
1258     if (ParentEdgeFreq < 0) {
1259       DEBUG(dumpEdgeFreqs());
1260       DEBUG(report("WARNING: incorrect flow"));
1261       ParentEdgeFreq = 0;
1262     }
1263     DEBUG(reportNumber("  Setting freq for ParentEdge: ", ParentEdge, 10));
1264     DEBUG(reportNumber("  with ParentEdgeFreq: ", ParentEdgeFreq, 10));
1265     EdgeFreqs[ParentEdge] = ParentEdgeFreq;
1266   }
1267 
1268   Alloc.deallocate(EntryAddress);
1269   Alloc.deallocate(LeafFrequency);
1270   Alloc.deallocate(Visited);
1271   Alloc.deallocate(Stack);
1272   CallMap->~NodeToCallsMap();
1273   Alloc.deallocate(CallMap);
1274   DEBUG(dumpEdgeFreqs());
1275 }
1276 
1277 /// Write to \p FD all of the edge profiles for function \p FuncDesc. Uses
1278 /// \p Alloc to allocate helper dynamic structures used to compute the profile
1279 /// for edges that we do not explicitly instrument.
1280 const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
1281                                     const uint8_t *FuncDesc,
1282                                     BumpPtrAllocator &Alloc) {
1283   const FunctionDescription F(FuncDesc);
1284   const uint8_t *next = FuncDesc + F.getSize();
1285 
1286 #if !defined(__APPLE__)
1287   uint64_t *bolt_instr_locations = __bolt_instr_locations;
1288 #else
1289   uint64_t *bolt_instr_locations = _bolt_instr_locations_getter();
1290 #endif
1291 
1292   // Skip funcs we know are cold
1293 #ifndef ENABLE_DEBUG
1294   uint64_t CountersFreq = 0;
1295   for (int I = 0; I < F.NumLeafNodes; ++I)
1296     CountersFreq += bolt_instr_locations[F.LeafNodes[I].Counter];
1297 
1298   if (CountersFreq == 0) {
1299     for (int I = 0; I < F.NumEdges; ++I) {
1300       const uint32_t C = F.Edges[I].Counter;
1301       if (C == 0xffffffff)
1302         continue;
1303       CountersFreq += bolt_instr_locations[C];
1304     }
1305     if (CountersFreq == 0) {
1306       for (int I = 0; I < F.NumCalls; ++I) {
1307         const uint32_t C = F.Calls[I].Counter;
1308         if (C == 0xffffffff)
1309           continue;
1310         CountersFreq += bolt_instr_locations[C];
1311       }
1312       if (CountersFreq == 0)
1313         return next;
1314     }
1315   }
1316 #endif
1317 
1318   Graph *G = new (Alloc) Graph(Alloc, F, bolt_instr_locations, Ctx);
1319   DEBUG(G->dump());
1320 
1321   if (!G->EdgeFreqs && !G->CallFreqs) {
1322     G->~Graph();
1323     Alloc.deallocate(G);
1324     return next;
1325   }
1326 
1327   for (int I = 0; I < F.NumEdges; ++I) {
1328     const uint64_t Freq = G->EdgeFreqs[I];
1329     if (Freq == 0)
1330       continue;
1331     const EdgeDescription *Desc = &F.Edges[I];
1332     char LineBuf[BufSize];
1333     char *Ptr = LineBuf;
1334     Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
1335     Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
1336     Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 22);
1337     Ptr = intToStr(Ptr, Freq, 10);
1338     *Ptr++ = '\n';
1339     __write(FD, LineBuf, Ptr - LineBuf);
1340   }
1341 
1342   for (int I = 0; I < F.NumCalls; ++I) {
1343     const uint64_t Freq = G->CallFreqs[I];
1344     if (Freq == 0)
1345       continue;
1346     char LineBuf[BufSize];
1347     char *Ptr = LineBuf;
1348     const CallDescription *Desc = &F.Calls[I];
1349     Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
1350     Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
1351     Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1352     Ptr = intToStr(Ptr, Freq, 10);
1353     *Ptr++ = '\n';
1354     __write(FD, LineBuf, Ptr - LineBuf);
1355   }
1356 
1357   G->~Graph();
1358   Alloc.deallocate(G);
1359   return next;
1360 }
1361 
1362 #if !defined(__APPLE__)
1363 const IndCallTargetDescription *
1364 ProfileWriterContext::lookupIndCallTarget(uint64_t Target) const {
1365   uint32_t B = 0;
1366   uint32_t E = __bolt_instr_num_ind_targets;
1367   if (E == 0)
1368     return nullptr;
1369   do {
1370     uint32_t I = (E - B) / 2 + B;
1371     if (IndCallTargets[I].Address == Target)
1372       return &IndCallTargets[I];
1373     if (IndCallTargets[I].Address < Target)
1374       B = I + 1;
1375     else
1376       E = I;
1377   } while (B < E);
1378   return nullptr;
1379 }
1380 
1381 /// Write a single indirect call <src, target> pair to the fdata file
1382 void visitIndCallCounter(IndirectCallHashTable::MapEntry &Entry,
1383                          int FD, int CallsiteID,
1384                          ProfileWriterContext *Ctx) {
1385   if (Entry.Val == 0)
1386     return;
1387   DEBUG(reportNumber("Target func 0x", Entry.Key, 16));
1388   DEBUG(reportNumber("Target freq: ", Entry.Val, 10));
1389   const IndCallDescription *CallsiteDesc =
1390       &Ctx->IndCallDescriptions[CallsiteID];
1391   const IndCallTargetDescription *TargetDesc =
1392       Ctx->lookupIndCallTarget(Entry.Key);
1393   if (!TargetDesc) {
1394     DEBUG(report("Failed to lookup indirect call target\n"));
1395     char LineBuf[BufSize];
1396     char *Ptr = LineBuf;
1397     Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
1398     Ptr = strCopy(Ptr, "0 [unknown] 0 0 ", BufSize - (Ptr - LineBuf) - 40);
1399     Ptr = intToStr(Ptr, Entry.Val, 10);
1400     *Ptr++ = '\n';
1401     __write(FD, LineBuf, Ptr - LineBuf);
1402     return;
1403   }
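  // Credit these calls to the target so visitCallFlowEntry can later compare
  // them against the callee's reported entry count.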
1404   Ctx->CallFlowTable->get(TargetDesc->Address).Calls += Entry.Val;
1405   char LineBuf[BufSize];
1406   char *Ptr = LineBuf;
1407   Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
1408   Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
1409   Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1410   Ptr = intToStr(Ptr, Entry.Val, 10);
1411   *Ptr++ = '\n';
1412   __write(FD, LineBuf, Ptr - LineBuf);
1413 }
1414 
1415 /// Write to \p FD all of the indirect call profiles.
1416 void writeIndirectCallProfile(int FD, ProfileWriterContext &Ctx) {
1417   for (int I = 0; I < __bolt_instr_num_ind_calls; ++I) {
1418     DEBUG(reportNumber("IndCallsite #", I, 10));
1419     GlobalIndCallCounters[I].forEachElement(visitIndCallCounter, FD, I, &Ctx);
1420   }
1421 }
1422 
1423 /// Check a single call flow for a callee versus all known callers. If there are
1424 /// fewer recorded calls than the callee's entry count, write the difference
1425 /// with source [unknown] in the profile.
1426 void visitCallFlowEntry(CallFlowHashTable::MapEntry &Entry, int FD,
1427                         ProfileWriterContext *Ctx) {
1428   DEBUG(reportNumber("Call flow entry address: 0x", Entry.Key, 16));
1429   DEBUG(reportNumber("Calls: ", Entry.Calls, 10));
1430   DEBUG(reportNumber("Reported entry frequency: ", Entry.Val, 10));
1431   DEBUG({
1432     if (Entry.Calls > Entry.Val)
1433       report("  More calls than expected!\n");
1434   });
1435   if (Entry.Val <= Entry.Calls)
1436     return;
1437   DEBUG(reportNumber(
1438       "  Balancing calls with traffic: ", Entry.Val - Entry.Calls, 10));
1439   const IndCallTargetDescription *TargetDesc =
1440       Ctx->lookupIndCallTarget(Entry.Key);
1441   if (!TargetDesc) {
1442     // There is probably something wrong with this callee and this should be
1443     // investigated, but I don't want to assert and lose all data collected.
1444     DEBUG(report("WARNING: failed to look up call target!\n"));
1445     return;
1446   }
1447   char LineBuf[BufSize];
1448   char *Ptr = LineBuf;
1449   Ptr = strCopy(Ptr, "0 [unknown] 0 ", BufSize);
1450   Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
1451   Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1452   Ptr = intToStr(Ptr, Entry.Val - Entry.Calls, 10);
1453   *Ptr++ = '\n';
1454   __write(FD, LineBuf, Ptr - LineBuf);
1455 }
1456 
1457 /// Open the fdata file for writing and return a valid file descriptor, aborting
1458 /// the program upon failure.
1459 int openProfile() {
1460   // Build the profile name string by appending our PID
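  // e.g. if __bolt_instr_filename is "prof.fdata", __bolt_instr_use_pid is set
  // and our PID is 1234, the file written is "prof.fdata.1234.fdata".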
1461   char Buf[BufSize];
1462   char *Ptr = Buf;
1463   uint64_t PID = __getpid();
1464   Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
1465   if (__bolt_instr_use_pid) {
1466     Ptr = strCopy(Ptr, ".", BufSize - (Ptr - Buf + 1));
1467     Ptr = intToStr(Ptr, PID, 10);
1468     Ptr = strCopy(Ptr, ".fdata", BufSize - (Ptr - Buf + 1));
1469   }
1470   *Ptr++ = '\0';
1471   uint64_t FD = __open(Buf, O_WRONLY | O_TRUNC | O_CREAT,
1472                        /*mode=*/0666);
1473   if (static_cast<int64_t>(FD) < 0) {
1474     report("Error while trying to open profile file for writing: ");
1475     report(Buf);
1476     reportNumber("\nFailed with error number: 0x",
1477                  0 - static_cast<int64_t>(FD), 16);
1478     __exit(1);
1479   }
1480   return FD;
1481 }
1482 
1483 #endif
1484 
1485 } // anonymous namespace
1486 
1487 #if !defined(__APPLE__)
1488 
1489 /// Reset all counters in case you want to start profiling a new phase of your
1490 /// program independently of prior phases.
1491 /// The address of this function is printed by BOLT and it can be called from
1492 /// any attached debugger at runtime. There is a useful one-liner for gdb:
1493 ///
1494 ///   gdb -p $(pgrep -xo PROCESSNAME) -ex 'p ((void(*)())0xdeadbeef)()' \
1495 ///     -ex 'set confirm off' -ex quit
1496 ///
1497 /// where 0xdeadbeef is this function's address and PROCESSNAME is the name of
1498 /// your binary.
1499 extern "C" void __bolt_instr_clear_counters() {
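  // Counters are 64-bit slots, hence the "* 8" when computing the size in bytes.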
1500   memset(reinterpret_cast<char *>(__bolt_instr_locations), 0,
1501          __bolt_num_counters * 8);
1502   for (int I = 0; I < __bolt_instr_num_ind_calls; ++I)
1503     GlobalIndCallCounters[I].resetCounters();
1504 }
1505 
1506 /// This is the entry point for profile writing.
1507 /// There are three ways of getting here:
1508 ///
1509 ///  * Program execution ended, finalization methods are running and BOLT
1510 ///    hooked into FINI in your binary's dynamic section;
1511 ///  * You used the sleep timer option and during initialization we forked
1512 ///    a separate process that will call this function periodically;
1513 ///  * BOLT prints this function's address so you can attach a debugger and
1514 ///    call this function directly to get your profile written to disk
1515 ///    on demand.
1516 ///
1517 extern "C" void __attribute((force_align_arg_pointer))
1518 __bolt_instr_data_dump(int FD) {
1519   // Bail out if another dump is already in progress
1520   if (!GlobalWriteProfileMutex->acquire())
1521     return;
1522 
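  // Rewind and truncate the file so periodic dumps from the watcher process
  // replace stale data instead of appending to it.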
1523   int ret = __lseek(FD, 0, SEEK_SET);
1524   assert(ret == 0, "Failed to lseek!");
1525   ret = __ftruncate(FD, 0);
1526   assert(ret == 0, "Failed to ftruncate!");
1527   BumpPtrAllocator HashAlloc;
1528   HashAlloc.setMaxSize(0x6400000);
1529   ProfileWriterContext Ctx = readDescriptions();
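  // The call flow table accumulates, per callee address, the calls we managed
  // to attribute to known call sites; any remainder versus the callee's entry
  // count is written later with an [unknown] source (see visitCallFlowEntry).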
1530   Ctx.CallFlowTable = new (HashAlloc, 0) CallFlowHashTable(HashAlloc);
1531 
1532   DEBUG(printStats(Ctx));
1533 
1534   BumpPtrAllocator Alloc;
1535   Alloc.setMaxSize(0x6400000);
1536   const uint8_t *FuncDesc = Ctx.FuncDescriptions;
1537   for (int I = 0, E = __bolt_instr_num_funcs; I < E; ++I) {
1538     FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
1539     Alloc.clear();
1540     DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
1541   }
1542   assert(FuncDesc == (void *)Ctx.Strings,
1543          "FuncDesc ptr must be equal to stringtable");
1544 
1545   writeIndirectCallProfile(FD, Ctx);
1546   Ctx.CallFlowTable->forEachElement(visitCallFlowEntry, FD, &Ctx);
1547 
1548   __fsync(FD);
1549   __munmap(Ctx.MMapPtr, Ctx.MMapSize);
1550   __close(Ctx.FileDesc);
1551   HashAlloc.destroy();
1552   GlobalWriteProfileMutex->release();
1553   DEBUG(report("Finished writing profile.\n"));
1554 }
1555 
1556 /// Event loop for our child process spawned during setup to dump profile data
1557 /// at user-specified intervals.
1558 void watchProcess() {
1559   timespec ts, rem;
1560   uint64_t Ellapsed = 0ull;
1561   int FD = openProfile();
1562   uint64_t ppid;
1563   if (__bolt_instr_wait_forks) {
1564     // Store the negated parent pgid; __kill(ppid, 0) below then probes the group
1565     ppid = -__getpgid(0);
1566     // And leave parent process group
1567     __setpgid(0, 0);
1568   } else {
1569     // Store parent pid
1570     ppid = __getppid();
1571     if (ppid == 1) {
1572       // Parent already dead
1573       __bolt_instr_data_dump(FD);
1574       goto out;
1575     }
1576   }
1577 
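  // Sleep in one-second ticks; every __bolt_instr_sleep_time ticks dump the
  // profile and, unless disabled, clear the counters for the next interval.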
1578   ts.tv_sec = 1;
1579   ts.tv_nsec = 0;
1580   while (1) {
1581     __nanosleep(&ts, &rem);
1582     // If our parent process (or, with wait_forks, its whole process group) is
1583     // gone, there is no need for us to keep dumping.
1584     if (__kill(ppid, 0) < 0) {
1585       if (__bolt_instr_no_counters_clear)
1586         __bolt_instr_data_dump(FD);
1587       break;
1588     }
1589 
1590     if (++Ellapsed < __bolt_instr_sleep_time)
1591       continue;
1592 
1593     Ellapsed = 0;
1594     __bolt_instr_data_dump(FD);
1595     if (__bolt_instr_no_counters_clear == false)
1596       __bolt_instr_clear_counters();
1597   }
1598 
1599 out:;
1600   DEBUG(report("My parent process is dead, bye!\n"));
1601   __close(FD);
1602   __exit(0);
1603 }
1604 
1605 extern "C" void __bolt_instr_indirect_call();
1606 extern "C" void __bolt_instr_indirect_tailcall();
1607 
1608 /// Initialization code, run from __bolt_instr_start before the original entry point.
1609 extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() {
1610   __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call;
1611   __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall;
1612 
1613   const uint64_t CountersStart =
1614       reinterpret_cast<uint64_t>(&__bolt_instr_locations[0]);
1615   const uint64_t CountersEnd = alignTo(
1616       reinterpret_cast<uint64_t>(&__bolt_instr_locations[__bolt_num_counters]),
1617       0x1000);
1618   DEBUG(reportNumber("replace mmap start: ", CountersStart, 16));
1619   DEBUG(reportNumber("replace mmap stop: ", CountersEnd, 16));
1620   assert(CountersEnd > CountersStart, "no counters");
1621 
1622   const bool Shared = !__bolt_instr_use_pid;
1623   const uint64_t MapPrivateOrShared = Shared ? MAP_SHARED : MAP_PRIVATE;
1624 
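  // Remap the counter area emitted by BOLT in place (MAP_FIXED). When no PID
  // suffix is used the mapping is shared, so forked children and the watcher
  // process all update (and read) the same counters.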
1625   void *Ret =
1626       __mmap(CountersStart, CountersEnd - CountersStart, PROT_READ | PROT_WRITE,
1627              MAP_ANONYMOUS | MapPrivateOrShared | MAP_FIXED, -1, 0);
1628   assert(Ret != MAP_FAILED, "__bolt_instr_setup: Failed to mmap counters!");
1629 
1630   GlobalMetadataStorage = __mmap(0, 4096, PROT_READ | PROT_WRITE,
1631                                  MapPrivateOrShared | MAP_ANONYMOUS, -1, 0);
1632   assert(GlobalMetadataStorage != MAP_FAILED,
1633          "__bolt_instr_setup: failed to mmap page for metadata!");
1634 
1635   GlobalAlloc = new (GlobalMetadataStorage) BumpPtrAllocator;
1636   // Conservatively reserve 100MiB
1637   GlobalAlloc->setMaxSize(0x6400000);
1638   GlobalAlloc->setShared(Shared);
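  // Objects carved out of this allocator (the profile-writing mutex and the
  // indirect-call hash tables below) therefore live in memory that is shared
  // across forks whenever the counters themselves are shared.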
1639   GlobalWriteProfileMutex = new (*GlobalAlloc, 0) Mutex();
1640   if (__bolt_instr_num_ind_calls > 0)
1641     GlobalIndCallCounters =
1642         new (*GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls];
1643 
1644   if (__bolt_instr_sleep_time != 0) {
1645     // Move the instrumented process into its own process group
1646     if (__bolt_instr_wait_forks)
1647       __setpgid(0, 0);
1648 
1649     if (long PID = __fork())
1650       return;
1651     watchProcess();
1652   }
1653 }
1654 
1655 extern "C" __attribute((force_align_arg_pointer)) void
1656 instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) {
1657   GlobalIndCallCounters[IndCallID].incrementVal(Target, *GlobalAlloc);
1658 }
1659 
1660 /// We receive the identifier of the indirect call site and the target address
1661 /// of the call as in-stack arguments.
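/// The fixed 0xa0/0x98 offsets reach past the register state pushed by SAVE_ALL
/// to those two stack slots; the tail-call stub below uses offsets 8 bytes
/// smaller to match its slightly different stack layout.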
1662 extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
1663 {
1664   __asm__ __volatile__(SAVE_ALL
1665                        "mov 0xa0(%%rsp), %%rdi\n"
1666                        "mov 0x98(%%rsp), %%rsi\n"
1667                        "call instrumentIndirectCall\n"
1668                        RESTORE_ALL
1669                        "ret\n"
1670                        :::);
1671 }
1672 
1673 extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
1674 {
1675   __asm__ __volatile__(SAVE_ALL
1676                        "mov 0x98(%%rsp), %%rdi\n"
1677                        "mov 0x90(%%rsp), %%rsi\n"
1678                        "call instrumentIndirectCall\n"
1679                        RESTORE_ALL
1680                        "ret\n"
1681                        :::);
1682 }
1683 
1684 /// This hooks the ELF entry point; it needs to save all machine state.
1685 extern "C" __attribute((naked)) void __bolt_instr_start()
1686 {
1687   __asm__ __volatile__(SAVE_ALL
1688                        "call __bolt_instr_setup\n"
1689                        RESTORE_ALL
1690                        "jmp __bolt_start_trampoline\n"
1691                        :::);
1692 }
1693 
1694 /// This hooks into ELF's DT_FINI.
1695 extern "C" void __bolt_instr_fini() {
1696   __bolt_fini_trampoline();
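  // When the sleep timer is in use the forked watcher process owns profile
  // dumping, so only dump here when no timer was requested.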
1697   if (__bolt_instr_sleep_time == 0) {
1698     int FD = openProfile();
1699     __bolt_instr_data_dump(FD);
1700     __close(FD);
1701   }
1702   DEBUG(report("Finished.\n"));
1703 }
1704 
1705 #endif
1706 
1707 #if defined(__APPLE__)
1708 
1709 extern "C" void __bolt_instr_data_dump() {
1710   ProfileWriterContext Ctx = readDescriptions();
1711 
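  // Write the profile to stderr (fd 2) on Darwin.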
1712   int FD = 2;
1713   BumpPtrAllocator Alloc;
1714   const uint8_t *FuncDesc = Ctx.FuncDescriptions;
1715   uint32_t bolt_instr_num_funcs = _bolt_instr_num_funcs_getter();
1716 
1717   for (int I = 0, E = bolt_instr_num_funcs; I < E; ++I) {
1718     FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
1719     Alloc.clear();
1720     DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
1721   }
1722   assert(FuncDesc == (void *)Ctx.Strings,
1723          "FuncDesc ptr must be equal to stringtable");
1724 }
1725 
1726 // On OSX/iOS the final symbol name of an extern "C" function/variable contains
1727 // one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
1728 extern "C"
1729 __attribute__((section("__TEXT,__setup")))
1730 __attribute__((force_align_arg_pointer))
1731 void _bolt_instr_setup() {
1732   __asm__ __volatile__(SAVE_ALL :::);
1733 
1734   report("Hello!\n");
1735 
1736   __asm__ __volatile__(RESTORE_ALL :::);
1737 }
1738 
1739 extern "C"
1740 __attribute__((section("__TEXT,__fini")))
1741 __attribute__((force_align_arg_pointer))
1742 void _bolt_instr_fini() {
1743   report("Bye!\n");
1744   __bolt_instr_data_dump();
1745 }
1746 
1747 #endif
1748 #endif
1749