1 //===- bolt/runtime/instr.cpp ---------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // BOLT runtime instrumentation library for x86 Linux. Currently, BOLT does
10 // not support linking modules with dependencies on one another into the final
11 // binary (TODO?), which means this library has to be self-contained in a single
12 // module.
13 //
14 // All extern declarations here need to be defined by BOLT itself. Those will be
15 // undefined symbols that BOLT needs to resolve by emitting these symbols with
16 // MCStreamer. Currently, Passes/Instrumentation.cpp is the pass responsible
17 // for defining the symbols here, so the two files are tightly coupled: one
18 // works statically when you run BOLT, the other at program runtime when
19 // you run an instrumented binary. The main goal here is to output an fdata file
20 // (BOLT profile) with the instrumentation counters inserted by the static pass.
21 // Counters for indirect calls are an exception, as we can't know them
22 // statically. These counters are created and managed here. To allow this, we
23 // need a minimal framework for allocating memory dynamically. We provide this
24 // with the BumpPtrAllocator class (not LLVM's, but our own version of it).
25 //
26 // Since this code is intended to be inserted into any executable, we decided to
27 // make it standalone and not depend on any external libraries (e.g. language
28 // support libraries such as glibc or stdc++). To allow this, we provide a few
29 // light implementations of common OS interacting functionalities using direct
30 // syscall wrappers. Our simple allocator doesn't manage deallocations that
31 // fragment the memory space, so it's stack based. This is the minimal framework
32 // provided here to allow processing instrumented counters and writing fdata.
33 //
34 // In the C++ idiom used here, we never use or rely on constructors or
35 // destructors for global objects. That's because those need support from the
36 // linker in initialization/finalization code, and we want to keep our linker
37 // very simple. Similarly, we don't create any global objects that are zero
38 // initialized, since those would need to go into .bss, which our simple linker
39 // also doesn't support (TODO?).
40 //
41 //===----------------------------------------------------------------------===//
42 
43 #if defined (__x86_64__)
44 #include "common.h"
45 
46 // Enables very verbose logging to stderr, useful when debugging
47 //#define ENABLE_DEBUG
48 
49 #ifdef ENABLE_DEBUG
50 #define DEBUG(X)                                                               \
51   { X; }
52 #else
53 #define DEBUG(X)                                                               \
54   {}
55 #endif
56 
57 #pragma GCC visibility push(hidden)
58 
59 extern "C" {
60 
61 #if defined(__APPLE__)
62 extern uint64_t* _bolt_instr_locations_getter();
63 extern uint32_t _bolt_num_counters_getter();
64 
65 extern uint8_t* _bolt_instr_tables_getter();
66 extern uint32_t _bolt_instr_num_funcs_getter();
67 
68 #else
69 
70 // Main counters inserted by instrumentation, incremented during runtime when
71 // points of interest (locations) in the program are reached. Those are direct
72 // calls and direct and indirect branches (local ones). There are also counters
73 // for the execution of basic blocks that are spanning tree leaves and need to
74 // be counted in order to infer the execution count of the other CFG edges.
75 extern uint64_t __bolt_instr_locations[];
76 extern uint32_t __bolt_num_counters;
77 // Descriptions are serialized metadata about binary functions written by BOLT,
78 // so we have a minimal understanding of the program structure. For a
79 // reference on the exact format of this metadata, see the *Description structs,
80 // Location, InstrumentedNode and EntryNode.
81 // Number of indirect call site descriptions
82 extern uint32_t __bolt_instr_num_ind_calls;
83 // Number of indirect call target descriptions
84 extern uint32_t __bolt_instr_num_ind_targets;
85 // Number of function descriptions
86 extern uint32_t __bolt_instr_num_funcs;
87 // Time to sleep across dumps (when we write the fdata profile to disk)
88 extern uint32_t __bolt_instr_sleep_time;
89 // Do not clear counters across dumps, rewrite file with the updated values
90 extern bool __bolt_instr_no_counters_clear;
91 // Wait until all forks of the instrumented process finish
92 extern bool __bolt_instr_wait_forks;
93 // Filename to dump data to
94 extern char __bolt_instr_filename[];
95 // Instrumented binary file path
96 extern char __bolt_instr_binpath[];
97 // If true, append current PID to the fdata filename when creating it so
98 // different invocations of the same program can be differentiated.
99 extern bool __bolt_instr_use_pid;
100 // Functions that will be used to instrument indirect calls. The BOLT static
101 // pass identifies indirect calls and modifies them to load the address stored
102 // in these trampolines and call it instead. BOLT can't use direct calls to
103 // our handlers because our addresses here are not known at analysis time. We
104 // only support resolving dependencies from this file to the output of BOLT,
105 // *not* the other way around.
106 // TODO: We need better linking support to make that happen.
107 extern void (*__bolt_ind_call_counter_func_pointer)();
108 extern void (*__bolt_ind_tailcall_counter_func_pointer)();
109 // Function pointers to init/fini trampoline routines in the binary, so we can
110 // resume regular execution of these functions that we hooked
111 extern void __bolt_start_trampoline();
112 extern void __bolt_fini_trampoline();
113 
114 #endif
115 }
116 
117 namespace {
118 
119 /// A simple allocator that mmaps a fixed size region and manages this space
120 /// in a stack fashion, meaning you always deallocate the last element that
121 /// was allocated. In practice, we don't need to deallocate individual elements.
122 /// We monotonically increase our usage and then deallocate everything once we
123 /// are done processing something.
124 class BumpPtrAllocator {
125   /// This is written before each allocation and acts as a canary to detect
126   /// when a bug causes our program to cross allocation boundaries.
127   struct EntryMetadata {
128     uint64_t Magic;
129     uint64_t AllocSize;
130   };
131 
132 public:
133   void *allocate(size_t Size) {
134     Lock L(M);
135 
136     if (StackBase == nullptr) {
137       StackBase = reinterpret_cast<uint8_t *>(
138           __mmap(0, MaxSize, PROT_READ | PROT_WRITE,
139                  (Shared ? MAP_SHARED : MAP_PRIVATE) | MAP_ANONYMOUS, -1, 0));
140       assert(StackBase != MAP_FAILED,
141              "BumpPtrAllocator: failed to mmap stack!");
142       StackSize = 0;
143     }
144 
145     Size = alignTo(Size + sizeof(EntryMetadata), 16);
146     uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
147     auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
148     M->Magic = Magic;
149     M->AllocSize = Size;
150     StackSize += Size;
151     assert(StackSize < MaxSize, "allocator ran out of memory");
152     return AllocAddress;
153   }
154 
155 #ifdef DEBUG
156   /// Element-wise deallocation is only used for debugging to catch memory
157   /// bugs by checking magic bytes. Ordinarily, we reset the allocator once
158   /// we are done with it. Reset is done with clear(). There's no need
159   /// to deallocate each element individually.
160   void deallocate(void *Ptr) {
161     Lock L(M);
162     uint8_t MetadataOffset = sizeof(EntryMetadata);
163     auto *M = reinterpret_cast<EntryMetadata *>(
164         reinterpret_cast<uint8_t *>(Ptr) - MetadataOffset);
165     const uint8_t *StackTop = StackBase + StackSize + MetadataOffset;
166     // Validate size
167     if (Ptr != StackTop - M->AllocSize) {
168       // Failed validation, check if it is a pointer returned by operator new []
169       MetadataOffset +=
170           sizeof(uint64_t); // Space for number of elements alloc'ed
171       M = reinterpret_cast<EntryMetadata *>(reinterpret_cast<uint8_t *>(Ptr) -
172                                             MetadataOffset);
173       // Ok, it failed both checks if this assertion fails. Stop the program, we
174       // have a memory bug.
175       assert(Ptr == StackTop - M->AllocSize,
176              "must deallocate the last element alloc'ed");
177     }
178     assert(M->Magic == Magic, "allocator magic is corrupt");
179     StackSize -= M->AllocSize;
180   }
181 #else
182   void deallocate(void *) {}
183 #endif
184 
185   void clear() {
186     Lock L(M);
187     StackSize = 0;
188   }
189 
190   /// Set mmap reservation size (only relevant before first allocation)
191   void setMaxSize(uint64_t Size) { MaxSize = Size; }
192 
193   /// Set mmap reservation privacy (only relevant before first allocation)
194   void setShared(bool S) { Shared = S; }
195 
196   void destroy() {
197     if (StackBase == nullptr)
198       return;
199     __munmap(StackBase, MaxSize);
200   }
201 
202   // Placement operator to construct allocator in possibly shared mmaped memory
203   static void *operator new(size_t, void *Ptr) { return Ptr; };
204 
205 private:
206   static constexpr uint64_t Magic = 0x1122334455667788ull;
207   uint64_t MaxSize = 0xa00000;
208   uint8_t *StackBase{nullptr};
209   uint64_t StackSize{0};
210   bool Shared{false};
211   Mutex M;
212 };
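
// A note on the allocation layout above (an illustrative sketch, not a
// contract): each call to allocate() reserves a 16-byte-aligned record of the
// form
//   [EntryMetadata: Magic | AllocSize][ user data ... padding ]
// and returns a pointer just past the metadata. The debug-only deallocate()
// walks back sizeof(EntryMetadata) bytes to find and validate this header.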
213 
214 /// Used for allocating indirect call instrumentation counters. Initialized by
215 /// __bolt_instr_setup, our initialization routine.
216 BumpPtrAllocator *GlobalAlloc;
217 
218 // Base address which we subtract from recorded PC values when searching for
219 // indirect call description entries. Needed because indCall descriptions are
220 // mapped read-only and contain static addresses. Initialized in
221 // __bolt_instr_setup.
222 uint64_t TextBaseAddress = 0;
223 
224 // Storage for GlobalAlloc which can be shared if not using
225 // instrumentation-file-append-pid.
226 void *GlobalMetadataStorage;
227 
228 } // anonymous namespace
229 
230 // User-defined placement new operators. We only use those (as opposed to
231 // overriding the regular operator new) so we can keep our allocator on the
232 // stack instead of in a data section (global).
233 void *operator new(size_t Sz, BumpPtrAllocator &A) { return A.allocate(Sz); }
234 void *operator new(size_t Sz, BumpPtrAllocator &A, char C) {
235   auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
236   memset(Ptr, C, Sz);
237   return Ptr;
238 }
239 void *operator new[](size_t Sz, BumpPtrAllocator &A) {
240   return A.allocate(Sz);
241 }
242 void *operator new[](size_t Sz, BumpPtrAllocator &A, char C) {
243   auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
244   memset(Ptr, C, Sz);
245   return Ptr;
246 }
247 // Only called automatically during exception unwinding (so effectively
248 // useless); we must manually deallocate. C++ language weirdness.
249 void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); }
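
// Illustrative usage sketch (names here are hypothetical): the operators above
// let the rest of this file allocate from a BumpPtrAllocator via placement new:
//
//   BumpPtrAllocator &Alloc = *GlobalAlloc;
//   uint64_t *Freqs = new (Alloc, 0) uint64_t[NumEdges]; // zero-filled
//   Node *Nodes = new (Alloc) Node[MaxNodes];            // default-constructed
//   ...
//   Alloc.deallocate(Nodes);  // stack discipline: release in reverse order
//   Alloc.deallocate(Freqs);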
250 
251 namespace {
252 
253 // Disable instrumentation optimizations that sacrifice profile accuracy
254 extern "C" bool __bolt_instr_conservative;
255 
256 /// Basic key-val atom stored in our hash
257 struct SimpleHashTableEntryBase {
258   uint64_t Key;
259   uint64_t Val;
260   void dump(const char *Msg = nullptr) {
261     // TODO: make some sort of formatting function
262     // Currently we have to do it the ugly way because
263     // we want every message to be printed atomically via a single call to
264     // __write. If we use reportNumber() and others multiple times, we'll get
265     // garbage in a multithreaded environment.
266     char Buf[BufSize];
267     char *Ptr = Buf;
268     Ptr = intToStr(Ptr, __getpid(), 10);
269     *Ptr++ = ':';
270     *Ptr++ = ' ';
271     if (Msg)
272       Ptr = strCopy(Ptr, Msg, strLen(Msg));
273     *Ptr++ = '0';
274     *Ptr++ = 'x';
275     Ptr = intToStr(Ptr, (uint64_t)this, 16);
276     *Ptr++ = ':';
277     *Ptr++ = ' ';
278     Ptr = strCopy(Ptr, "MapEntry(0x", sizeof("MapEntry(0x") - 1);
279     Ptr = intToStr(Ptr, Key, 16);
280     *Ptr++ = ',';
281     *Ptr++ = ' ';
282     *Ptr++ = '0';
283     *Ptr++ = 'x';
284     Ptr = intToStr(Ptr, Val, 16);
285     *Ptr++ = ')';
286     *Ptr++ = '\n';
287     assert(Ptr - Buf < BufSize, "Buffer overflow!");
288     // print everything all at once for atomicity
289     __write(2, Buf, Ptr - Buf);
290   }
291 };
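
// For reference, dump() above emits one line of the form (values illustrative):
//   <pid>: <Msg>0x<this>: MapEntry(0x<Key>, 0x<Val>)
// e.g. "12345: Hit: 0x7f00deadbeef: MapEntry(0x401000, 0x2)".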
292 
293 /// This hash table implementation starts by allocating a table of size
294 /// InitialSize. When conflicts happen in this main table, it resolves
295 /// them by chaining a new table of size IncSize. It never reallocs as our
296 /// allocator doesn't support it. Keys are intended to be function pointers.
297 /// There's no clever hash function (it's just x mod size, size being prime).
298 /// The coefficients in the modular equation were never tuned (TODO).
299 /// This is used for indirect calls (each call site has one of these, so it
300 /// should have a small footprint) and for tallying call counts globally for
301 /// each target to check if we missed the origin of some calls (this one is a
302 /// large instantiation of this template, since it is global for all call sites)
303 template <typename T = SimpleHashTableEntryBase, uint32_t InitialSize = 7,
304           uint32_t IncSize = 7>
305 class SimpleHashTable {
306 public:
307   using MapEntry = T;
308 
309   /// Increment by 1 the value of \p Key. If it is not in this table, it will be
310   /// added to the table and its value set to 1.
311   void incrementVal(uint64_t Key, BumpPtrAllocator &Alloc) {
312     if (!__bolt_instr_conservative) {
313       TryLock L(M);
314       if (!L.isLocked())
315         return;
316       auto &E = getOrAllocEntry(Key, Alloc);
317       ++E.Val;
318       return;
319     }
320     Lock L(M);
321     auto &E = getOrAllocEntry(Key, Alloc);
322     ++E.Val;
323   }
324 
325   /// Basic member accessing interface. Here we pass the allocator explicitly to
326   /// avoid storing a pointer to it as part of this table (remember there is one
327   /// hash for each indirect call site, so we want to minimize our footprint).
328   MapEntry &get(uint64_t Key, BumpPtrAllocator &Alloc) {
329     if (!__bolt_instr_conservative) {
330       TryLock L(M);
331       if (!L.isLocked())
332         return NoEntry;
333       return getOrAllocEntry(Key, Alloc);
334     }
335     Lock L(M);
336     return getOrAllocEntry(Key, Alloc);
337   }
338 
339   /// Traverses all elements in the table
340   template <typename... Args>
341   void forEachElement(void (*Callback)(MapEntry &, Args...), Args... args) {
342     Lock L(M);
343     if (!TableRoot)
344       return;
345     return forEachElement(Callback, InitialSize, TableRoot, args...);
346   }
347 
348   void resetCounters();
349 
350 private:
351   constexpr static uint64_t VacantMarker = 0;
352   constexpr static uint64_t FollowUpTableMarker = 0x8000000000000000ull;
353 
354   MapEntry *TableRoot{nullptr};
355   MapEntry NoEntry;
356   Mutex M;
357 
358   template <typename... Args>
359   void forEachElement(void (*Callback)(MapEntry &, Args...),
360                       uint32_t NumEntries, MapEntry *Entries, Args... args) {
361     for (uint32_t I = 0; I < NumEntries; ++I) {
362       MapEntry &Entry = Entries[I];
363       if (Entry.Key == VacantMarker)
364         continue;
365       if (Entry.Key & FollowUpTableMarker) {
366         MapEntry *Next =
367             reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker);
368         assert(Next != Entries, "Circular reference!");
369         forEachElement(Callback, IncSize, Next, args...);
370         continue;
371       }
372       Callback(Entry, args...);
373     }
374   }
375 
376   MapEntry &firstAllocation(uint64_t Key, BumpPtrAllocator &Alloc) {
377     TableRoot = new (Alloc, 0) MapEntry[InitialSize];
378     MapEntry &Entry = TableRoot[Key % InitialSize];
379     Entry.Key = Key;
380     // DEBUG(Entry.dump("Created root entry: "));
381     return Entry;
382   }
383 
384   MapEntry &getEntry(MapEntry *Entries, uint64_t Key, uint64_t Selector,
385                      BumpPtrAllocator &Alloc, int CurLevel) {
386     // DEBUG(reportNumber("getEntry called, level ", CurLevel, 10));
387     const uint32_t NumEntries = CurLevel == 0 ? InitialSize : IncSize;
388     uint64_t Remainder = Selector / NumEntries;
389     Selector = Selector % NumEntries;
390     MapEntry &Entry = Entries[Selector];
391 
392     // A hit
393     if (Entry.Key == Key) {
394       // DEBUG(Entry.dump("Hit: "));
395       return Entry;
396     }
397 
398     // Vacant - add new entry
399     if (Entry.Key == VacantMarker) {
400       Entry.Key = Key;
401       // DEBUG(Entry.dump("Adding new entry: "));
402       return Entry;
403     }
404 
405     // Defer to the next level
406     if (Entry.Key & FollowUpTableMarker) {
407       return getEntry(
408           reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker),
409           Key, Remainder, Alloc, CurLevel + 1);
410     }
411 
412     // Conflict - create the next level
413     // DEBUG(Entry.dump("Creating new level: "));
414 
415     MapEntry *NextLevelTbl = new (Alloc, 0) MapEntry[IncSize];
416     // DEBUG(
417     //     reportNumber("Newly allocated level: 0x", uint64_t(NextLevelTbl),
418     //     16));
419     uint64_t CurEntrySelector = Entry.Key / InitialSize;
420     for (int I = 0; I < CurLevel; ++I)
421       CurEntrySelector /= IncSize;
422     CurEntrySelector = CurEntrySelector % IncSize;
423     NextLevelTbl[CurEntrySelector] = Entry;
424     Entry.Key = reinterpret_cast<uint64_t>(NextLevelTbl) | FollowUpTableMarker;
425     assert((NextLevelTbl[CurEntrySelector].Key & ~FollowUpTableMarker) !=
426                uint64_t(Entries),
427            "circular reference created!\n");
428     // DEBUG(NextLevelTbl[CurEntrySelector].dump("New level entry: "));
429     // DEBUG(Entry.dump("Updated old entry: "));
430     return getEntry(NextLevelTbl, Key, Remainder, Alloc, CurLevel + 1);
431   }
432 
433   MapEntry &getOrAllocEntry(uint64_t Key, BumpPtrAllocator &Alloc) {
434     if (TableRoot) {
435       MapEntry &E = getEntry(TableRoot, Key, Key, Alloc, 0);
436       assert(!(E.Key & FollowUpTableMarker), "Invalid entry!");
437       return E;
438     }
439     return firstAllocation(Key, Alloc);
440   }
441 };
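
// Worked example (illustrative) of the bucketing scheme above, with the
// default InitialSize = IncSize = 7: Key = 100 selects slot 100 % 7 == 2 in
// the root table. If that slot already holds a different key, a follow-up
// table is chained and both keys are re-bucketed by the next quotient: for
// Key = 100 that is 100 / 7 == 14, i.e. slot 14 % 7 == 0 of the new table,
// and so on for deeper levels.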
442 
443 template <typename T> void resetIndCallCounter(T &Entry) {
444   Entry.Val = 0;
445 }
446 
447 template <typename T, uint32_t X, uint32_t Y>
448 void SimpleHashTable<T, X, Y>::resetCounters() {
449   forEachElement(resetIndCallCounter);
450 }
451 
452 /// Represents a hash table mapping a function target address to its counter.
453 using IndirectCallHashTable = SimpleHashTable<>;
454 
455 /// Initialize with number 1 instead of 0 so we don't go into .bss. This is the
456 /// global array of all hash tables storing indirect call destinations observed
457 /// during runtime, one table per call site.
458 IndirectCallHashTable *GlobalIndCallCounters{
459     reinterpret_cast<IndirectCallHashTable *>(1)};
460 
461 /// Don't allow reentrancy in the fdata writing phase - only one thread writes
462 /// it
463 Mutex *GlobalWriteProfileMutex{reinterpret_cast<Mutex *>(1)};
464 
465 /// Store the number of calls in addition to the target address (Key) and the
466 /// frequency as perceived by the basic block counter (Val).
467 struct CallFlowEntryBase : public SimpleHashTableEntryBase {
468   uint64_t Calls;
469 };
470 
471 using CallFlowHashTableBase = SimpleHashTable<CallFlowEntryBase, 11939, 233>;
472 
473 /// This is a large table indexing all possible call targets (indirect and
474 /// direct ones). The goal is to find mismatches between number of calls (for
475 /// those calls we were able to track) and the entry basic block counter of the
476 /// callee. In most cases, these two should be equal. If not, there are two
477 /// possible scenarios here:
478 ///
479 ///  * Entry BB has higher frequency than all known calls to this function.
480 ///    In this case, we have dynamic library code or any uninstrumented code
481 ///    calling this function. We will write the profile for these untracked
482 ///    calls as having source "0 [unknown] 0" in the fdata file.
483 ///
484 ///  * Number of known calls is higher than the frequency of entry BB
485 ///    This only happens when there is no counter for the entry BB / callee
486 ///    function is not simple (in BOLT terms). We don't do anything special
487 ///    here and just ignore those (we still report all calls to the non-simple
488 ///    function, though).
489 ///
490 class CallFlowHashTable : public CallFlowHashTableBase {
491 public:
492   CallFlowHashTable(BumpPtrAllocator &Alloc) : Alloc(Alloc) {}
493 
494   MapEntry &get(uint64_t Key) { return CallFlowHashTableBase::get(Key, Alloc); }
495 
496 private:
497   // Unlike the hash table for indirect call targets, we do store the allocator
498   // here, since there is only one call flow hash table and the space overhead
499   // is negligible.
500   BumpPtrAllocator &Alloc;
501 };
502 
503 ///
504 /// Description metadata emitted by BOLT to describe the program - refer to
505 /// Passes/Instrumentation.cpp - Instrumentation::emitTablesAsELFNote()
506 ///
507 struct Location {
508   uint32_t FunctionName;
509   uint32_t Offset;
510 };
511 
512 struct CallDescription {
513   Location From;
514   uint32_t FromNode;
515   Location To;
516   uint32_t Counter;
517   uint64_t TargetAddress;
518 };
519 
520 using IndCallDescription = Location;
521 
522 struct IndCallTargetDescription {
523   Location Loc;
524   uint64_t Address;
525 };
526 
527 struct EdgeDescription {
528   Location From;
529   uint32_t FromNode;
530   Location To;
531   uint32_t ToNode;
532   uint32_t Counter;
533 };
534 
535 struct InstrumentedNode {
536   uint32_t Node;
537   uint32_t Counter;
538 };
539 
540 struct EntryNode {
541   uint64_t Node;
542   uint64_t Address;
543 };
544 
545 struct FunctionDescription {
546   uint32_t NumLeafNodes;
547   const InstrumentedNode *LeafNodes;
548   uint32_t NumEdges;
549   const EdgeDescription *Edges;
550   uint32_t NumCalls;
551   const CallDescription *Calls;
552   uint32_t NumEntryNodes;
553   const EntryNode *EntryNodes;
554 
555   /// Constructor will parse the serialized function metadata written by BOLT
556   FunctionDescription(const uint8_t *FuncDesc);
557 
558   uint64_t getSize() const {
559     return 16 + NumLeafNodes * sizeof(InstrumentedNode) +
560            NumEdges * sizeof(EdgeDescription) +
561            NumCalls * sizeof(CallDescription) +
562            NumEntryNodes * sizeof(EntryNode);
563   }
564 };
565 
566 /// The context is created when the fdata profile needs to be written to disk
567 /// and we need to interpret our runtime counters. It contains pointers to the
568 /// mmaped binary (only the BOLT written metadata section). Deserialization
569 /// should be straightforward as most data is POD or an array of POD elements.
570 /// This metadata is used to reconstruct function CFGs.
571 struct ProfileWriterContext {
572   IndCallDescription *IndCallDescriptions;
573   IndCallTargetDescription *IndCallTargets;
574   uint8_t *FuncDescriptions;
575   char *Strings;  // String table with function names used in this binary
576   int FileDesc;   // File descriptor for the file on disk backing this
577                   // information in memory via mmap
578   void *MMapPtr;  // The mmap ptr
579   int MMapSize;   // The mmap size
580 
581   /// Hash table storing all possible call destinations to detect untracked
582   /// calls and correctly report them as [unknown] in output fdata.
583   CallFlowHashTable *CallFlowTable;
584 
585   /// Lookup the sorted indirect call target vector to fetch function name and
586   /// offset for an arbitrary function pointer.
587   const IndCallTargetDescription *lookupIndCallTarget(uint64_t Target) const;
588 };
589 
590 /// Performs a string comparison and returns zero if Str1 matches Str2. Compares
591 /// at most Size characters.
592 int compareStr(const char *Str1, const char *Str2, int Size) {
593   while (*Str1 == *Str2) {
594     if (*Str1 == '\0' || --Size == 0)
595       return 0;
596     ++Str1;
597     ++Str2;
598   }
599   return 1;
600 }
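
// Note: unlike strncmp, compareStr only distinguishes a match (0) from a
// mismatch (1) within the first Size characters; e.g. the section-name check
// further below uses compareStr(SecName, ".bolt.instr.tables", 64).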
601 
602 /// Output Location to the fdata file
603 char *serializeLoc(const ProfileWriterContext &Ctx, char *OutBuf,
604                    const Location Loc, uint32_t BufSize) {
605   // fdata location format: Type Name Offset
606   // Type 1 - regular symbol
607   OutBuf = strCopy(OutBuf, "1 ");
608   const char *Str = Ctx.Strings + Loc.FunctionName;
609   uint32_t Size = 25;
610   while (*Str) {
611     *OutBuf++ = *Str++;
612     if (++Size >= BufSize)
613       break;
614   }
615   assert(!*Str, "buffer overflow, function name too large");
616   *OutBuf++ = ' ';
617   OutBuf = intToStr(OutBuf, Loc.Offset, 16);
618   *OutBuf++ = ' ';
619   return OutBuf;
620 }
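
// For reference, for a location in a function named "main" at offset 0x30 the
// routine above appends "1 main 30 " to OutBuf: symbol type 1, the function
// name, and the offset in hex, each followed by a space (names illustrative).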
621 
622 /// Read and deserialize a function description written by BOLT. \p FuncDesc
623 /// points at the beginning of the function metadata structure in the file.
624 /// See Instrumentation::emitTablesAsELFNote()
625 FunctionDescription::FunctionDescription(const uint8_t *FuncDesc) {
626   NumLeafNodes = *reinterpret_cast<const uint32_t *>(FuncDesc);
627   DEBUG(reportNumber("NumLeafNodes = ", NumLeafNodes, 10));
628   LeafNodes = reinterpret_cast<const InstrumentedNode *>(FuncDesc + 4);
629 
630   NumEdges = *reinterpret_cast<const uint32_t *>(
631       FuncDesc + 4 + NumLeafNodes * sizeof(InstrumentedNode));
632   DEBUG(reportNumber("NumEdges = ", NumEdges, 10));
633   Edges = reinterpret_cast<const EdgeDescription *>(
634       FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode));
635 
636   NumCalls = *reinterpret_cast<const uint32_t *>(
637       FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode) +
638       NumEdges * sizeof(EdgeDescription));
639   DEBUG(reportNumber("NumCalls = ", NumCalls, 10));
640   Calls = reinterpret_cast<const CallDescription *>(
641       FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) +
642       NumEdges * sizeof(EdgeDescription));
643   NumEntryNodes = *reinterpret_cast<const uint32_t *>(
644       FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) +
645       NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription));
646   DEBUG(reportNumber("NumEntryNodes = ", NumEntryNodes, 10));
647   EntryNodes = reinterpret_cast<const EntryNode *>(
648       FuncDesc + 16 + NumLeafNodes * sizeof(InstrumentedNode) +
649       NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription));
650 }
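
// For reference, the serialized layout parsed above is, in order:
//   [u32 NumLeafNodes] [InstrumentedNode x NumLeafNodes]
//   [u32 NumEdges]     [EdgeDescription  x NumEdges]
//   [u32 NumCalls]     [CallDescription  x NumCalls]
//   [u32 NumEntryNodes][EntryNode        x NumEntryNodes]
// which is why getSize() adds 16 bytes (the four u32 counts) to the sum of
// the array sizes.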
651 
652 /// Read and mmap descriptions written by BOLT from the executable's notes
653 /// section
654 #if defined(HAVE_ELF_H) and !defined(__APPLE__)
655 
656 void *__attribute__((noinline)) __get_pc() {
657   return __builtin_extract_return_addr(__builtin_return_address(0));
658 }
659 
660 /// Parse a string of the form <hex1>-<hex2> into the pair <StartAddress, EndAddress>
661 bool parseAddressRange(const char *Str, uint64_t &StartAddress,
662                        uint64_t &EndAddress) {
663   if (!Str)
664     return false;
665   // Parsed string format: <hex1>-<hex2>
666   StartAddress = hexToLong(Str, '-');
667   while (*Str && *Str != '-')
668     ++Str;
669   if (!*Str)
670     return false;
671   ++Str; // swallow '-'
672   EndAddress = hexToLong(Str);
673   return true;
674 }
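
// Illustrative example (addresses hypothetical):
//   parseAddressRange("55e4c3f9d000-55e4c3fa2000", Start, End)
// yields Start == 0x55e4c3f9d000 and End == 0x55e4c3fa2000, matching the link
// names found under /proc/self/map_files/.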
675 
676 /// Get the full path to the real binary by taking the current virtual address
677 /// and searching for the link whose address range contains it in
678 /// /proc/self/map_files
679 static char *getBinaryPath() {
680   const uint32_t BufSize = 1024;
681   const uint32_t NameMax = 4096;
682   const char DirPath[] = "/proc/self/map_files/";
683   static char TargetPath[NameMax] = {};
684   char Buf[BufSize];
685 
686   if (__bolt_instr_binpath[0] != '\0')
687     return __bolt_instr_binpath;
688 
689   if (TargetPath[0] != '\0')
690     return TargetPath;
691 
692   unsigned long CurAddr = (unsigned long)__get_pc();
693   uint64_t FDdir = __open(DirPath, O_RDONLY,
694                           /*mode=*/0666);
695   assert(static_cast<int64_t>(FDdir) >= 0,
696          "failed to open /proc/self/map_files");
697 
698   while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) {
699     assert(static_cast<int64_t>(Nread) != -1, "failed to get folder entries");
700 
701     struct dirent *d;
702     for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) {
703       d = (struct dirent *)(Buf + Bpos);
704 
705       uint64_t StartAddress, EndAddress;
706       if (!parseAddressRange(d->d_name, StartAddress, EndAddress))
707         continue;
708       if (CurAddr < StartAddress || CurAddr > EndAddress)
709         continue;
710       char FindBuf[NameMax];
711       char *C = strCopy(FindBuf, DirPath, NameMax);
712       C = strCopy(C, d->d_name, NameMax - (C - FindBuf));
713       *C = '\0';
714       uint32_t Ret = __readlink(FindBuf, TargetPath, sizeof(TargetPath));
715       assert(Ret != -1 && Ret != BufSize, "readlink error");
716       TargetPath[Ret] = '\0';
717       return TargetPath;
718     }
719   }
720   return nullptr;
721 }
722 
723 ProfileWriterContext readDescriptions() {
724   ProfileWriterContext Result;
725   char *BinPath = getBinaryPath();
726   assert(BinPath && BinPath[0] != '\0', "failed to find binary path");
727 
728   uint64_t FD = __open(BinPath, O_RDONLY,
729                        /*mode=*/0666);
730   assert(static_cast<int64_t>(FD) >= 0, "failed to open binary path");
731 
732   Result.FileDesc = FD;
733 
734   // mmap our binary to memory
735   uint64_t Size = __lseek(FD, 0, SEEK_END);
736   uint8_t *BinContents = reinterpret_cast<uint8_t *>(
737       __mmap(0, Size, PROT_READ, MAP_PRIVATE, FD, 0));
738   assert(BinContents != MAP_FAILED, "readDescriptions: Failed to mmap self!");
739   Result.MMapPtr = BinContents;
740   Result.MMapSize = Size;
741   Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
742   Elf64_Shdr *Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff);
743   Elf64_Shdr *StringTblHeader = reinterpret_cast<Elf64_Shdr *>(
744       BinContents + Hdr->e_shoff + Hdr->e_shstrndx * Hdr->e_shentsize);
745 
746   // Find .bolt.instr.tables with the data we need and set pointers to it
747   for (int I = 0; I < Hdr->e_shnum; ++I) {
748     char *SecName = reinterpret_cast<char *>(
749         BinContents + StringTblHeader->sh_offset + Shdr->sh_name);
750     if (compareStr(SecName, ".bolt.instr.tables", 64) != 0) {
751       Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff +
752                                             (I + 1) * Hdr->e_shentsize);
753       continue;
754     }
755     // Actual contents of the ELF note start after offset 20 decimal:
756     // Offset 0: Producer name size (4 bytes)
757     // Offset 4: Contents size (4 bytes)
758     // Offset 8: Note type (4 bytes)
759     // Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
760     // Offset 20: Contents
761     uint32_t IndCallDescSize =
762         *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 20);
763     uint32_t IndCallTargetDescSize = *reinterpret_cast<uint32_t *>(
764         BinContents + Shdr->sh_offset + 24 + IndCallDescSize);
765     uint32_t FuncDescSize =
766         *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 28 +
767                                       IndCallDescSize + IndCallTargetDescSize);
768     Result.IndCallDescriptions = reinterpret_cast<IndCallDescription *>(
769         BinContents + Shdr->sh_offset + 24);
770     Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
771         BinContents + Shdr->sh_offset + 28 + IndCallDescSize);
772     Result.FuncDescriptions = BinContents + Shdr->sh_offset + 32 +
773                               IndCallDescSize + IndCallTargetDescSize;
774     Result.Strings = reinterpret_cast<char *>(
775         BinContents + Shdr->sh_offset + 32 + IndCallDescSize +
776         IndCallTargetDescSize + FuncDescSize);
777     return Result;
778   }
779   const char ErrMsg[] =
780       "BOLT instrumentation runtime error: could not find section "
781       ".bolt.instr.tables\n";
782   reportError(ErrMsg, sizeof(ErrMsg));
783   return Result;
784 }
785 
786 #else
787 
788 ProfileWriterContext readDescriptions() {
789   ProfileWriterContext Result;
790   uint8_t *Tables = _bolt_instr_tables_getter();
791   uint32_t IndCallDescSize = *reinterpret_cast<uint32_t *>(Tables);
792   uint32_t IndCallTargetDescSize =
793       *reinterpret_cast<uint32_t *>(Tables + 4 + IndCallDescSize);
794   uint32_t FuncDescSize = *reinterpret_cast<uint32_t *>(
795       Tables + 8 + IndCallDescSize + IndCallTargetDescSize);
796   Result.IndCallDescriptions =
797       reinterpret_cast<IndCallDescription *>(Tables + 4);
798   Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
799       Tables + 8 + IndCallDescSize);
800   Result.FuncDescriptions =
801       Tables + 12 + IndCallDescSize + IndCallTargetDescSize;
802   Result.Strings = reinterpret_cast<char *>(
803       Tables + 12 + IndCallDescSize + IndCallTargetDescSize + FuncDescSize);
804   return Result;
805 }
806 
807 #endif
808 
809 #if !defined(__APPLE__)
810 /// Debug helper: print overall metadata totals to check that they are sane
811 void printStats(const ProfileWriterContext &Ctx) {
812   char StatMsg[BufSize];
813   char *StatPtr = StatMsg;
814   StatPtr =
815       strCopy(StatPtr,
816               "\nBOLT INSTRUMENTATION RUNTIME STATISTICS\n\nIndCallDescSize: ");
817   StatPtr = intToStr(StatPtr,
818                      Ctx.FuncDescriptions -
819                          reinterpret_cast<uint8_t *>(Ctx.IndCallDescriptions),
820                      10);
821   StatPtr = strCopy(StatPtr, "\nFuncDescSize: ");
822   StatPtr = intToStr(
823       StatPtr,
824       reinterpret_cast<uint8_t *>(Ctx.Strings) - Ctx.FuncDescriptions, 10);
825   StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_ind_calls: ");
826   StatPtr = intToStr(StatPtr, __bolt_instr_num_ind_calls, 10);
827   StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_funcs: ");
828   StatPtr = intToStr(StatPtr, __bolt_instr_num_funcs, 10);
829   StatPtr = strCopy(StatPtr, "\n");
830   __write(2, StatMsg, StatPtr - StatMsg);
831 }
832 #endif
833 
834 
835 /// This is part of a simple CFG representation in memory, where we store
836 /// a dynamically sized array of input and output edges per node, and store
837 /// a dynamically sized array of nodes per graph. We also store the spanning
838 /// tree edges for that CFG in a separate array of nodes in
839 /// \p SpanningTreeNodes, while the regular nodes live in \p CFGNodes.
840 struct Edge {
841   uint32_t Node; // Index of this edge's destination in the nodes array
842   uint32_t ID;   // Edge index in an array comprising all edges of the graph
843 };
844 
845 /// A regular graph node or a spanning tree node
846 struct Node {
847   uint32_t NumInEdges{0};  // Input edge count used to size InEdge
848   uint32_t NumOutEdges{0}; // Output edge count used to size OutEdges
849   Edge *InEdges{nullptr};  // Created and managed by \p Graph
850   Edge *OutEdges{nullptr}; // ditto
851 };
852 
853 /// Main class for CFG representation in memory. Manages object creation and
854 /// destruction, populates an array of CFG nodes as well as corresponding
855 /// spanning tree nodes.
856 struct Graph {
857   uint32_t NumNodes;
858   Node *CFGNodes;
859   Node *SpanningTreeNodes;
860   uint64_t *EdgeFreqs;
861   uint64_t *CallFreqs;
862   BumpPtrAllocator &Alloc;
863   const FunctionDescription &D;
864 
865   /// Reads a list of edges from function description \p D and builds
866   /// the graph from it. Allocates several internal dynamic structures that are
867   /// later destroyed by ~Graph() and uses \p Alloc. D.LeafNodes contains the
868   /// descriptions (counters) of all spanning tree leaf nodes. They are the seed
869   /// used to compute the rest of the missing edge counts in a bottom-up
870   /// traversal of the spanning tree.
871   Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
872         const uint64_t *Counters, ProfileWriterContext &Ctx);
873   ~Graph();
874   void dump() const;
875 
876 private:
877   void computeEdgeFrequencies(const uint64_t *Counters,
878                               ProfileWriterContext &Ctx);
879   void dumpEdgeFreqs() const;
880 };
881 
882 Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
883              const uint64_t *Counters, ProfileWriterContext &Ctx)
884     : Alloc(Alloc), D(D) {
885   DEBUG(reportNumber("G = 0x", (uint64_t)this, 16));
886   // First pass to determine number of nodes
887   int32_t MaxNodes = -1;
888   CallFreqs = nullptr;
889   EdgeFreqs = nullptr;
890   for (int I = 0; I < D.NumEdges; ++I) {
891     if (static_cast<int32_t>(D.Edges[I].FromNode) > MaxNodes)
892       MaxNodes = D.Edges[I].FromNode;
893     if (static_cast<int32_t>(D.Edges[I].ToNode) > MaxNodes)
894       MaxNodes = D.Edges[I].ToNode;
895   }
896 
897   for (int I = 0; I < D.NumLeafNodes; ++I)
898     if (static_cast<int32_t>(D.LeafNodes[I].Node) > MaxNodes)
899       MaxNodes = D.LeafNodes[I].Node;
900 
901   for (int I = 0; I < D.NumCalls; ++I)
902     if (static_cast<int32_t>(D.Calls[I].FromNode) > MaxNodes)
903       MaxNodes = D.Calls[I].FromNode;
904 
905   // No nodes? Nothing to do
906   if (MaxNodes < 0) {
907     DEBUG(report("No nodes!\n"));
908     CFGNodes = nullptr;
909     SpanningTreeNodes = nullptr;
910     NumNodes = 0;
911     return;
912   }
913   ++MaxNodes;
914   DEBUG(reportNumber("NumNodes = ", MaxNodes, 10));
915   NumNodes = static_cast<uint32_t>(MaxNodes);
916 
917   // Initial allocations
918   CFGNodes = new (Alloc) Node[MaxNodes];
919 
920   DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16));
921   SpanningTreeNodes = new (Alloc) Node[MaxNodes];
922   DEBUG(reportNumber("G->SpanningTreeNodes = 0x",
923                      (uint64_t)SpanningTreeNodes, 16));
924 
925   // Figure out how much to allocate to each vector (in/out edge sets)
926   for (int I = 0; I < D.NumEdges; ++I) {
927     CFGNodes[D.Edges[I].FromNode].NumOutEdges++;
928     CFGNodes[D.Edges[I].ToNode].NumInEdges++;
929     if (D.Edges[I].Counter != 0xffffffff)
930       continue;
931 
932     SpanningTreeNodes[D.Edges[I].FromNode].NumOutEdges++;
933     SpanningTreeNodes[D.Edges[I].ToNode].NumInEdges++;
934   }
935 
936   // Allocate in/out edge sets
937   for (int I = 0; I < MaxNodes; ++I) {
938     if (CFGNodes[I].NumInEdges > 0)
939       CFGNodes[I].InEdges = new (Alloc) Edge[CFGNodes[I].NumInEdges];
940     if (CFGNodes[I].NumOutEdges > 0)
941       CFGNodes[I].OutEdges = new (Alloc) Edge[CFGNodes[I].NumOutEdges];
942     if (SpanningTreeNodes[I].NumInEdges > 0)
943       SpanningTreeNodes[I].InEdges =
944           new (Alloc) Edge[SpanningTreeNodes[I].NumInEdges];
945     if (SpanningTreeNodes[I].NumOutEdges > 0)
946       SpanningTreeNodes[I].OutEdges =
947           new (Alloc) Edge[SpanningTreeNodes[I].NumOutEdges];
948     CFGNodes[I].NumInEdges = 0;
949     CFGNodes[I].NumOutEdges = 0;
950     SpanningTreeNodes[I].NumInEdges = 0;
951     SpanningTreeNodes[I].NumOutEdges = 0;
952   }
953 
954   // Fill in/out edge sets
955   for (int I = 0; I < D.NumEdges; ++I) {
956     const uint32_t Src = D.Edges[I].FromNode;
957     const uint32_t Dst = D.Edges[I].ToNode;
958     Edge *E = &CFGNodes[Src].OutEdges[CFGNodes[Src].NumOutEdges++];
959     E->Node = Dst;
960     E->ID = I;
961 
962     E = &CFGNodes[Dst].InEdges[CFGNodes[Dst].NumInEdges++];
963     E->Node = Src;
964     E->ID = I;
965 
966     if (D.Edges[I].Counter != 0xffffffff)
967       continue;
968 
969     E = &SpanningTreeNodes[Src]
970              .OutEdges[SpanningTreeNodes[Src].NumOutEdges++];
971     E->Node = Dst;
972     E->ID = I;
973 
974     E = &SpanningTreeNodes[Dst]
975              .InEdges[SpanningTreeNodes[Dst].NumInEdges++];
976     E->Node = Src;
977     E->ID = I;
978   }
979 
980   computeEdgeFrequencies(Counters, Ctx);
981 }
982 
983 Graph::~Graph() {
984   if (CallFreqs)
985     Alloc.deallocate(CallFreqs);
986   if (EdgeFreqs)
987     Alloc.deallocate(EdgeFreqs);
988   for (int I = NumNodes - 1; I >= 0; --I) {
989     if (SpanningTreeNodes[I].OutEdges)
990       Alloc.deallocate(SpanningTreeNodes[I].OutEdges);
991     if (SpanningTreeNodes[I].InEdges)
992       Alloc.deallocate(SpanningTreeNodes[I].InEdges);
993     if (CFGNodes[I].OutEdges)
994       Alloc.deallocate(CFGNodes[I].OutEdges);
995     if (CFGNodes[I].InEdges)
996       Alloc.deallocate(CFGNodes[I].InEdges);
997   }
998   if (SpanningTreeNodes)
999     Alloc.deallocate(SpanningTreeNodes);
1000   if (CFGNodes)
1001     Alloc.deallocate(CFGNodes);
1002 }
1003 
1004 void Graph::dump() const {
1005   reportNumber("Dumping graph with number of nodes: ", NumNodes, 10);
1006   report("  Full graph:\n");
1007   for (int I = 0; I < NumNodes; ++I) {
1008     const Node *N = &CFGNodes[I];
1009     reportNumber("    Node #", I, 10);
1010     reportNumber("      InEdges total ", N->NumInEdges, 10);
1011     for (int J = 0; J < N->NumInEdges; ++J)
1012       reportNumber("        ", N->InEdges[J].Node, 10);
1013     reportNumber("      OutEdges total ", N->NumOutEdges, 10);
1014     for (int J = 0; J < N->NumOutEdges; ++J)
1015       reportNumber("        ", N->OutEdges[J].Node, 10);
1016     report("\n");
1017   }
1018   report("  Spanning tree:\n");
1019   for (int I = 0; I < NumNodes; ++I) {
1020     const Node *N = &SpanningTreeNodes[I];
1021     reportNumber("    Node #", I, 10);
1022     reportNumber("      InEdges total ", N->NumInEdges, 10);
1023     for (int J = 0; J < N->NumInEdges; ++J)
1024       reportNumber("        ", N->InEdges[J].Node, 10);
1025     reportNumber("      OutEdges total ", N->NumOutEdges, 10);
1026     for (int J = 0; J < N->NumOutEdges; ++J)
1027       reportNumber("        ", N->OutEdges[J].Node, 10);
1028     report("\n");
1029   }
1030 }
1031 
1032 void Graph::dumpEdgeFreqs() const {
1033   reportNumber(
1034       "Dumping edge frequencies for graph with num edges: ", D.NumEdges, 10);
1035   for (int I = 0; I < D.NumEdges; ++I) {
1036     reportNumber("* Src: ", D.Edges[I].FromNode, 10);
1037     reportNumber("  Dst: ", D.Edges[I].ToNode, 10);
1038     reportNumber("    Cnt: ", EdgeFreqs[I], 10);
1039   }
1040 }
1041 
1042 /// Auxiliary map structure for fast lookups of which calls map to each node of
1043 /// the function CFG
1044 struct NodeToCallsMap {
1045   struct MapEntry {
1046     uint32_t NumCalls;
1047     uint32_t *Calls;
1048   };
1049   MapEntry *Entries;
1050   BumpPtrAllocator &Alloc;
1051   const uint32_t NumNodes;
1052 
1053   NodeToCallsMap(BumpPtrAllocator &Alloc, const FunctionDescription &D,
1054                  uint32_t NumNodes)
1055       : Alloc(Alloc), NumNodes(NumNodes) {
1056     Entries = new (Alloc, 0) MapEntry[NumNodes];
1057     for (int I = 0; I < D.NumCalls; ++I) {
1058       DEBUG(reportNumber("Registering call in node ", D.Calls[I].FromNode, 10));
1059       ++Entries[D.Calls[I].FromNode].NumCalls;
1060     }
1061     for (int I = 0; I < NumNodes; ++I) {
1062       Entries[I].Calls = Entries[I].NumCalls ? new (Alloc)
1063                                                    uint32_t[Entries[I].NumCalls]
1064                                              : nullptr;
1065       Entries[I].NumCalls = 0;
1066     }
1067     for (int I = 0; I < D.NumCalls; ++I) {
1068       MapEntry &Entry = Entries[D.Calls[I].FromNode];
1069       Entry.Calls[Entry.NumCalls++] = I;
1070     }
1071   }
1072 
1073   /// Set the frequency of all calls in node \p NodeID to Freq. However, if
1074   /// the calls have their own counters and do not depend on the basic block
1075   /// counter, this means they have landing pads and throw exceptions. In this
1076   /// case, set their frequency with their counters and return the maximum
1077   /// value observed in such counters. This will be used as the new frequency
1078   /// at basic block entry. This is used to fix the CFG edge frequencies in the
1079   /// presence of exceptions.
1080   uint64_t visitAllCallsIn(uint32_t NodeID, uint64_t Freq, uint64_t *CallFreqs,
1081                            const FunctionDescription &D,
1082                            const uint64_t *Counters,
1083                            ProfileWriterContext &Ctx) const {
1084     const MapEntry &Entry = Entries[NodeID];
1085     uint64_t MaxValue = 0ull;
1086     for (int I = 0, E = Entry.NumCalls; I != E; ++I) {
1087       const uint32_t CallID = Entry.Calls[I];
1088       DEBUG(reportNumber("  Setting freq for call ID: ", CallID, 10));
1089       const CallDescription &CallDesc = D.Calls[CallID];
1090       if (CallDesc.Counter == 0xffffffff) {
1091         CallFreqs[CallID] = Freq;
1092         DEBUG(reportNumber("  with : ", Freq, 10));
1093       } else {
1094         const uint64_t CounterVal = Counters[CallDesc.Counter];
1095         CallFreqs[CallID] = CounterVal;
1096         MaxValue = CounterVal > MaxValue ? CounterVal : MaxValue;
1097         DEBUG(reportNumber("  with (private counter) : ", CounterVal, 10));
1098       }
1099       DEBUG(reportNumber("  Address: 0x", CallDesc.TargetAddress, 16));
1100       if (CallFreqs[CallID] > 0)
1101         Ctx.CallFlowTable->get(CallDesc.TargetAddress).Calls +=
1102             CallFreqs[CallID];
1103     }
1104     return MaxValue;
1105   }
1106 
1107   ~NodeToCallsMap() {
1108     for (int I = NumNodes - 1; I >= 0; --I)
1109       if (Entries[I].Calls)
1110         Alloc.deallocate(Entries[I].Calls);
1111     Alloc.deallocate(Entries);
1112   }
1113 };
1114 
1115 /// Fill an array with the frequency of each edge in the function represented
1116 /// by G, as well as another array for each call.
1117 void Graph::computeEdgeFrequencies(const uint64_t *Counters,
1118                                    ProfileWriterContext &Ctx) {
1119   if (NumNodes == 0)
1120     return;
1121 
1122   EdgeFreqs = D.NumEdges ? new (Alloc, 0) uint64_t [D.NumEdges] : nullptr;
1123   CallFreqs = D.NumCalls ? new (Alloc, 0) uint64_t [D.NumCalls] : nullptr;
1124 
1125   // Set up a lookup for the calls present in each node (BB)
1126   NodeToCallsMap *CallMap = new (Alloc) NodeToCallsMap(Alloc, D, NumNodes);
1127 
1128   // Perform a bottom-up, depth-first traversal of the spanning tree in G. Edges
1129   // in the spanning tree don't have explicit counters. We must infer their value
1130   // using a linear combination of other counters (sum of counters of the outgoing
1131   // edges minus sum of counters of the incoming edges).
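  // Illustrative example of that inference: if a node's instrumented outgoing
  // edges have counts 5 and 7, its frequency is 12; if its instrumented
  // incoming edges sum to 4, the single (uninstrumented) spanning-tree parent
  // edge must account for the remaining 12 - 4 = 8.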
1132   uint32_t *Stack = new (Alloc) uint32_t [NumNodes];
1133   uint32_t StackTop = 0;
1134   enum Status : uint8_t { S_NEW = 0, S_VISITING, S_VISITED };
1135   Status *Visited = new (Alloc, 0) Status[NumNodes];
1136   uint64_t *LeafFrequency = new (Alloc, 0) uint64_t[NumNodes];
1137   uint64_t *EntryAddress = new (Alloc, 0) uint64_t[NumNodes];
1138 
1139   // Set up a fast lookup for the frequency of leaf nodes, which have special
1140   // basic block frequency instrumentation (they are not edge profiled).
1141   for (int I = 0; I < D.NumLeafNodes; ++I) {
1142     LeafFrequency[D.LeafNodes[I].Node] = Counters[D.LeafNodes[I].Counter];
1143     DEBUG({
1144       if (Counters[D.LeafNodes[I].Counter] > 0) {
1145         reportNumber("Leaf Node# ", D.LeafNodes[I].Node, 10);
1146         reportNumber("     Counter: ", Counters[D.LeafNodes[I].Counter], 10);
1147       }
1148     });
1149   }
1150   for (int I = 0; I < D.NumEntryNodes; ++I) {
1151     EntryAddress[D.EntryNodes[I].Node] = D.EntryNodes[I].Address;
1152     DEBUG({
1153         reportNumber("Entry Node# ", D.EntryNodes[I].Node, 10);
1154         reportNumber("      Address: ", D.EntryNodes[I].Address, 16);
1155     });
1156   }
1157   // Add all root nodes to the stack
1158   for (int I = 0; I < NumNodes; ++I)
1159     if (SpanningTreeNodes[I].NumInEdges == 0)
1160       Stack[StackTop++] = I;
1161 
1162   // Empty stack?
1163   if (StackTop == 0) {
1164     DEBUG(report("Empty stack!\n"));
1165     Alloc.deallocate(EntryAddress);
1166     Alloc.deallocate(LeafFrequency);
1167     Alloc.deallocate(Visited);
1168     Alloc.deallocate(Stack);
1169     CallMap->~NodeToCallsMap();
1170     Alloc.deallocate(CallMap);
1171     if (CallFreqs)
1172       Alloc.deallocate(CallFreqs);
1173     if (EdgeFreqs)
1174       Alloc.deallocate(EdgeFreqs);
1175     EdgeFreqs = nullptr;
1176     CallFreqs = nullptr;
1177     return;
1178   }
1179   // Add all known edge counts, will infer the rest
1180   for (int I = 0; I < D.NumEdges; ++I) {
1181     const uint32_t C = D.Edges[I].Counter;
1182     if (C == 0xffffffff) // inferred counter - we will compute its value
1183       continue;
1184     EdgeFreqs[I] = Counters[C];
1185   }
1186 
1187   while (StackTop > 0) {
1188     const uint32_t Cur = Stack[--StackTop];
1189     DEBUG({
1190       if (Visited[Cur] == S_VISITING)
1191         report("(visiting) ");
1192       else
1193         report("(new) ");
1194       reportNumber("Cur: ", Cur, 10);
1195     });
1196 
1197     // This shouldn't happen in a tree
1198     assert(Visited[Cur] != S_VISITED, "should not have visited nodes in stack");
1199     if (Visited[Cur] == S_NEW) {
1200       Visited[Cur] = S_VISITING;
1201       Stack[StackTop++] = Cur;
1202       assert(StackTop <= NumNodes, "stack grew too large");
1203       for (int I = 0, E = SpanningTreeNodes[Cur].NumOutEdges; I < E; ++I) {
1204         const uint32_t Succ = SpanningTreeNodes[Cur].OutEdges[I].Node;
1205         Stack[StackTop++] = Succ;
1206         assert(StackTop <= NumNodes, "stack grew too large");
1207       }
1208       continue;
1209     }
1210     Visited[Cur] = S_VISITED;
1211 
1212     // Establish our node frequency based on outgoing edges, which should all be
1213     // resolved by now.
1214     int64_t CurNodeFreq = LeafFrequency[Cur];
1215     // Not a leaf?
1216     if (!CurNodeFreq) {
1217       for (int I = 0, E = CFGNodes[Cur].NumOutEdges; I != E; ++I) {
1218         const uint32_t SuccEdge = CFGNodes[Cur].OutEdges[I].ID;
1219         CurNodeFreq += EdgeFreqs[SuccEdge];
1220       }
1221     }
1222     if (CurNodeFreq < 0)
1223       CurNodeFreq = 0;
1224 
1225     const uint64_t CallFreq = CallMap->visitAllCallsIn(
1226         Cur, CurNodeFreq > 0 ? CurNodeFreq : 0, CallFreqs, D, Counters, Ctx);
1227 
1228     // Exception handling affected our output flow? Fix with calls info
1229     DEBUG({
1230       if (CallFreq > CurNodeFreq)
1231         report("Bumping node frequency with call info\n");
1232     });
1233     CurNodeFreq = CallFreq > CurNodeFreq ? CallFreq : CurNodeFreq;
1234 
1235     if (CurNodeFreq > 0) {
1236       if (uint64_t Addr = EntryAddress[Cur]) {
1237         DEBUG(
1238             reportNumber("  Setting flow at entry point address 0x", Addr, 16));
1239         DEBUG(reportNumber("  with: ", CurNodeFreq, 10));
1240         Ctx.CallFlowTable->get(Addr).Val = CurNodeFreq;
1241       }
1242     }
1243 
1244     // No parent? Reached a tree root, limit to call frequency updating.
1245     if (SpanningTreeNodes[Cur].NumInEdges == 0)
1246       continue;
1247 
1248     assert(SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent");
1249     const uint32_t Parent = SpanningTreeNodes[Cur].InEdges[0].Node;
1250     const uint32_t ParentEdge = SpanningTreeNodes[Cur].InEdges[0].ID;
1251 
1252     // Calculate parent edge freq.
1253     int64_t ParentEdgeFreq = CurNodeFreq;
1254     for (int I = 0, E = CFGNodes[Cur].NumInEdges; I != E; ++I) {
1255       const uint32_t PredEdge = CFGNodes[Cur].InEdges[I].ID;
1256       ParentEdgeFreq -= EdgeFreqs[PredEdge];
1257     }
1258 
1259     // Sometimes the conservative CFG that BOLT builds will lead to incorrect
1260     // flow computation. For example, in a BB that transitively calls the exit
1261     // syscall, BOLT will add a fall-through successor even though it should not
1262     // have any successors. So this block's execution count will likely be wrong.
1263     // We tolerate this imperfection since this case should be quite infrequent.
1264     if (ParentEdgeFreq < 0) {
1265       DEBUG(dumpEdgeFreqs());
1266       DEBUG(report("WARNING: incorrect flow"));
1267       ParentEdgeFreq = 0;
1268     }
1269     DEBUG(reportNumber("  Setting freq for ParentEdge: ", ParentEdge, 10));
1270     DEBUG(reportNumber("  with ParentEdgeFreq: ", ParentEdgeFreq, 10));
1271     EdgeFreqs[ParentEdge] = ParentEdgeFreq;
1272   }
1273 
1274   Alloc.deallocate(EntryAddress);
1275   Alloc.deallocate(LeafFrequency);
1276   Alloc.deallocate(Visited);
1277   Alloc.deallocate(Stack);
1278   CallMap->~NodeToCallsMap();
1279   Alloc.deallocate(CallMap);
1280   DEBUG(dumpEdgeFreqs());
1281 }
1282 
1283 /// Write to \p FD all of the edge profiles for function \p FuncDesc. Uses
1284 /// \p Alloc to allocate helper dynamic structures used to compute profile for
1285 /// edges that we do not explicitly instrument.
1286 const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
1287                                     const uint8_t *FuncDesc,
1288                                     BumpPtrAllocator &Alloc) {
1289   const FunctionDescription F(FuncDesc);
1290   const uint8_t *next = FuncDesc + F.getSize();
1291 
1292 #if !defined(__APPLE__)
1293   uint64_t *bolt_instr_locations = __bolt_instr_locations;
1294 #else
1295   uint64_t *bolt_instr_locations = _bolt_instr_locations_getter();
1296 #endif
1297 
1298   // Skip funcs we know are cold
1299 #ifndef ENABLE_DEBUG
1300   uint64_t CountersFreq = 0;
1301   for (int I = 0; I < F.NumLeafNodes; ++I)
1302     CountersFreq += bolt_instr_locations[F.LeafNodes[I].Counter];
1303 
1304   if (CountersFreq == 0) {
1305     for (int I = 0; I < F.NumEdges; ++I) {
1306       const uint32_t C = F.Edges[I].Counter;
1307       if (C == 0xffffffff)
1308         continue;
1309       CountersFreq += bolt_instr_locations[C];
1310     }
1311     if (CountersFreq == 0) {
1312       for (int I = 0; I < F.NumCalls; ++I) {
1313         const uint32_t C = F.Calls[I].Counter;
1314         if (C == 0xffffffff)
1315           continue;
1316         CountersFreq += bolt_instr_locations[C];
1317       }
1318       if (CountersFreq == 0)
1319         return next;
1320     }
1321   }
1322 #endif
1323 
1324   Graph *G = new (Alloc) Graph(Alloc, F, bolt_instr_locations, Ctx);
1325   DEBUG(G->dump());
1326 
1327   if (!G->EdgeFreqs && !G->CallFreqs) {
1328     G->~Graph();
1329     Alloc.deallocate(G);
1330     return next;
1331   }
1332 
1333   for (int I = 0; I < F.NumEdges; ++I) {
1334     const uint64_t Freq = G->EdgeFreqs[I];
1335     if (Freq == 0)
1336       continue;
1337     const EdgeDescription *Desc = &F.Edges[I];
1338     char LineBuf[BufSize];
1339     char *Ptr = LineBuf;
1340     Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
1341     Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
1342     Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 22);
1343     Ptr = intToStr(Ptr, Freq, 10);
1344     *Ptr++ = '\n';
1345     __write(FD, LineBuf, Ptr - LineBuf);
1346   }
1347 
1348   for (int I = 0; I < F.NumCalls; ++I) {
1349     const uint64_t Freq = G->CallFreqs[I];
1350     if (Freq == 0)
1351       continue;
1352     char LineBuf[BufSize];
1353     char *Ptr = LineBuf;
1354     const CallDescription *Desc = &F.Calls[I];
1355     Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
1356     Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
1357     Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1358     Ptr = intToStr(Ptr, Freq, 10);
1359     *Ptr++ = '\n';
1360     __write(FD, LineBuf, Ptr - LineBuf);
1361   }
1362 
1363   G->~Graph();
1364   Alloc.deallocate(G);
1365   return next;
1366 }
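
// For reference, each line emitted above follows the fdata branch format
//   1 <from_name> <from_offset> 1 <to_name> <to_offset> 0 <count>
// e.g. "1 main 12 1 foo 0 0 1200" (names and numbers illustrative); the fixed
// "0 " written before the count is the misprediction field, which this
// runtime does not collect.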
1367 
1368 #if !defined(__APPLE__)
1369 const IndCallTargetDescription *
1370 ProfileWriterContext::lookupIndCallTarget(uint64_t Target) const {
1371   uint32_t B = 0;
1372   uint32_t E = __bolt_instr_num_ind_targets;
1373   if (E == 0)
1374     return nullptr;
1375   do {
1376     uint32_t I = (E - B) / 2 + B;
1377     if (IndCallTargets[I].Address == Target)
1378       return &IndCallTargets[I];
1379     if (IndCallTargets[I].Address < Target)
1380       B = I + 1;
1381     else
1382       E = I;
1383   } while (B < E);
1384   return nullptr;
1385 }
1386 
1387 /// Write a single indirect call <src, target> pair to the fdata file
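/// If the target cannot be resolved to a known location, it is recorded with an
/// [unknown] destination.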
1388 void visitIndCallCounter(IndirectCallHashTable::MapEntry &Entry,
1389                          int FD, int CallsiteID,
1390                          ProfileWriterContext *Ctx) {
1391   if (Entry.Val == 0)
1392     return;
1393   DEBUG(reportNumber("Target func 0x", Entry.Key, 16));
1394   DEBUG(reportNumber("Target freq: ", Entry.Val, 10));
1395   const IndCallDescription *CallsiteDesc =
1396       &Ctx->IndCallDescriptions[CallsiteID];
1397   const IndCallTargetDescription *TargetDesc =
1398       Ctx->lookupIndCallTarget(Entry.Key - TextBaseAddress);
1399   if (!TargetDesc) {
1400     DEBUG(report("Failed to lookup indirect call target\n"));
1401     char LineBuf[BufSize];
1402     char *Ptr = LineBuf;
1403     Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
1404     Ptr = strCopy(Ptr, "0 [unknown] 0 0 ", BufSize - (Ptr - LineBuf) - 40);
1405     Ptr = intToStr(Ptr, Entry.Val, 10);
1406     *Ptr++ = '\n';
1407     __write(FD, LineBuf, Ptr - LineBuf);
1408     return;
1409   }
1410   Ctx->CallFlowTable->get(TargetDesc->Address).Calls += Entry.Val;
1411   char LineBuf[BufSize];
1412   char *Ptr = LineBuf;
1413   Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
1414   Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
1415   Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1416   Ptr = intToStr(Ptr, Entry.Val, 10);
1417   *Ptr++ = '\n';
1418   __write(FD, LineBuf, Ptr - LineBuf);
1419 }
1420 
1421 /// Write to \p FD all of the indirect call profiles.
1422 void writeIndirectCallProfile(int FD, ProfileWriterContext &Ctx) {
1423   for (int I = 0; I < __bolt_instr_num_ind_calls; ++I) {
1424     DEBUG(reportNumber("IndCallsite #", I, 10));
1425     GlobalIndCallCounters[I].forEachElement(visitIndCallCounter, FD, I, &Ctx);
1426   }
1427 }
1428 
1429 /// Check a single call flow for a callee versus all known callers. If there are
1430 /// fewer callers than the callee expects, write the difference with source
1431 /// [unknown] in the profile.
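/// For example, if the callee's entry counter reports 100 executions but only 60
/// calls were attributed to known call sites, a line crediting the remaining 40
/// to an [unknown] caller is emitted.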
1432 void visitCallFlowEntry(CallFlowHashTable::MapEntry &Entry, int FD,
1433                         ProfileWriterContext *Ctx) {
1434   DEBUG(reportNumber("Call flow entry address: 0x", Entry.Key, 16));
1435   DEBUG(reportNumber("Calls: ", Entry.Calls, 10));
1436   DEBUG(reportNumber("Reported entry frequency: ", Entry.Val, 10));
1437   DEBUG({
1438     if (Entry.Calls > Entry.Val)
1439       report("  More calls than expected!\n");
1440   });
1441   if (Entry.Val <= Entry.Calls)
1442     return;
1443   DEBUG(reportNumber(
1444       "  Balancing calls with traffic: ", Entry.Val - Entry.Calls, 10));
1445   const IndCallTargetDescription *TargetDesc =
1446       Ctx->lookupIndCallTarget(Entry.Key);
1447   if (!TargetDesc) {
1448     // There is probably something wrong with this callee and this should be
1449     // investigated, but I don't want to assert and lose all data collected.
1450     DEBUG(report("WARNING: failed to look up call target!\n"));
1451     return;
1452   }
1453   char LineBuf[BufSize];
1454   char *Ptr = LineBuf;
1455   Ptr = strCopy(Ptr, "0 [unknown] 0 ", BufSize);
1456   Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
1457   Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1458   Ptr = intToStr(Ptr, Entry.Val - Entry.Calls, 10);
1459   *Ptr++ = '\n';
1460   __write(FD, LineBuf, Ptr - LineBuf);
1461 }
1462 
1463 /// Open the fdata file for writing and return a valid file descriptor, aborting
1464 /// the program upon failure.
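/// As an illustration, with a configured file name of prof.fdata and PID 1234,
/// the PID-suffixed profile would be written to prof.fdata.1234.fdata.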
1465 int openProfile() {
1466   // Build the profile file name, appending our PID if requested
1467   char Buf[BufSize];
1468   char *Ptr = Buf;
1469   uint64_t PID = __getpid();
1470   Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
1471   if (__bolt_instr_use_pid) {
1472     Ptr = strCopy(Ptr, ".", BufSize - (Ptr - Buf + 1));
1473     Ptr = intToStr(Ptr, PID, 10);
1474     Ptr = strCopy(Ptr, ".fdata", BufSize - (Ptr - Buf + 1));
1475   }
1476   *Ptr++ = '\0';
1477   uint64_t FD = __open(Buf, O_WRONLY | O_TRUNC | O_CREAT,
1478                        /*mode=*/0666);
1479   if (static_cast<int64_t>(FD) < 0) {
1480     report("Error while trying to open profile file for writing: ");
1481     report(Buf);
1482     reportNumber("\nFailed with error number: 0x",
1483                  0 - static_cast<int64_t>(FD), 16);
1484     __exit(1);
1485   }
1486   return FD;
1487 }
1488 
1489 #endif
1490 
1491 } // anonymous namespace
1492 
1493 #if !defined(__APPLE__)
1494 
1495 /// Reset all counters in case you want to start profiling a new phase of your
1496 /// program independently of prior phases.
1497 /// The address of this function is printed by BOLT and this can be called by
1498 /// any attached debugger during runtime. There is a useful one-liner for gdb:
1499 ///
1500 ///   gdb -p $(pgrep -xo PROCESSNAME) -ex 'p ((void(*)())0xdeadbeef)()' \
1501 ///     -ex 'set confirm off' -ex quit
1502 ///
1503 /// Where 0xdeadbeef is this function's address and PROCESSNAME is your binary's
1504 /// file name.
1505 extern "C" void __bolt_instr_clear_counters() {
1506   memset(reinterpret_cast<char *>(__bolt_instr_locations), 0,
1507          __bolt_num_counters * 8);
1508   for (int I = 0; I < __bolt_instr_num_ind_calls; ++I)
1509     GlobalIndCallCounters[I].resetCounters();
1510 }
1511 
1512 /// This is the entry point for profile writing.
1513 /// There are three ways of getting here:
1514 ///
1515 ///  * Program execution ended, finalization methods are running and BOLT
1516 ///    hooked into FINI from your binary's dynamic section;
1517 ///  * You used the sleep timer option and during initialization we forked
1518 ///    a separate process that will call this function periodically;
1519 ///  * BOLT prints this function address so you can attach a debugger and
1520 ///    call this function directly to get your profile written to disk
1521 ///    on demand.
1522 ///
1523 extern "C" void __attribute((force_align_arg_pointer))
1524 __bolt_instr_data_dump(int FD) {
1525   // Already dumping
1526   if (!GlobalWriteProfileMutex->acquire())
1527     return;
1528 
1529   int ret = __lseek(FD, 0, SEEK_SET);
1530   assert(ret == 0, "Failed to lseek!");
1531   ret = __ftruncate(FD, 0);
1532   assert(ret == 0, "Failed to ftruncate!");
1533   BumpPtrAllocator HashAlloc;
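  // Cap the scratch allocator backing the call-flow hash table at 100 MiB.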
1534   HashAlloc.setMaxSize(0x6400000);
1535   ProfileWriterContext Ctx = readDescriptions();
1536   Ctx.CallFlowTable = new (HashAlloc, 0) CallFlowHashTable(HashAlloc);
1537 
1538   DEBUG(printStats(Ctx));
1539 
1540   BumpPtrAllocator Alloc;
1541   Alloc.setMaxSize(0x6400000);
1542   const uint8_t *FuncDesc = Ctx.FuncDescriptions;
1543   for (int I = 0, E = __bolt_instr_num_funcs; I < E; ++I) {
1544     FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
1545     Alloc.clear();
1546     DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
1547   }
1548   assert(FuncDesc == (void *)Ctx.Strings,
1549          "FuncDesc ptr must be equal to stringtable");
1550 
1551   writeIndirectCallProfile(FD, Ctx);
1552   Ctx.CallFlowTable->forEachElement(visitCallFlowEntry, FD, &Ctx);
1553 
1554   __fsync(FD);
1555   __munmap(Ctx.MMapPtr, Ctx.MMapSize);
1556   __close(Ctx.FileDesc);
1557   HashAlloc.destroy();
1558   GlobalWriteProfileMutex->release();
1559   DEBUG(report("Finished writing profile.\n"));
1560 }
1561 
1562 /// Event loop for our child process spawned during setup to dump profile data
1563 /// at user-specified intervals.
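/// The loop wakes up once per second; after __bolt_instr_sleep_time seconds it
/// dumps the profile and, unless counter clearing is disabled, resets the
/// counters before the next interval.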
1564 void watchProcess() {
1565   timespec ts, rem;
1566   uint64_t Elapsed = 0ull;
1567   int FD = openProfile();
1568   uint64_t ppid;
1569   if (__bolt_instr_wait_forks) {
1570     // Store the negated parent pgid (kill(-pgid, 0) probes the whole group)
1571     ppid = -__getpgid(0);
1572     // And leave parent process group
1573     __setpgid(0, 0);
1574   } else {
1575     // Store parent pid
1576     ppid = __getppid();
1577     if (ppid == 1) {
1578       // Parent already dead
1579       __bolt_instr_data_dump(FD);
1580       goto out;
1581     }
1582   }
1583 
1584   ts.tv_sec = 1;
1585   ts.tv_nsec = 0;
1586   while (1) {
1587     __nanosleep(&ts, &rem);
1588     // This means our parent process or all its forks are dead,
1589     // so no need for us to keep dumping.
1590     if (__kill(ppid, 0) < 0) {
1591       if (__bolt_instr_no_counters_clear)
1592         __bolt_instr_data_dump(FD);
1593       break;
1594     }
1595 
1596     if (++Elapsed < __bolt_instr_sleep_time)
1597       continue;
1598 
1599     Elapsed = 0;
1600     __bolt_instr_data_dump(FD);
1601     if (!__bolt_instr_no_counters_clear)
1602       __bolt_instr_clear_counters();
1603   }
1604 
1605 out:;
1606   DEBUG(report("My parent process is dead, bye!\n"));
1607   __close(FD);
1608   __exit(0);
1609 }
1610 
1611 extern "C" void __bolt_instr_indirect_call();
1612 extern "C" void __bolt_instr_indirect_tailcall();
1613 
1614 /// Initialization code
1615 extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() {
1616   __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call;
1617   __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall;
1618   TextBaseAddress = getTextBaseAddress();
1619 
1620   const uint64_t CountersStart =
1621       reinterpret_cast<uint64_t>(&__bolt_instr_locations[0]);
1622   const uint64_t CountersEnd = alignTo(
1623       reinterpret_cast<uint64_t>(&__bolt_instr_locations[__bolt_num_counters]),
1624       0x1000);
1625   DEBUG(reportNumber("replace mmap start: ", CountersStart, 16));
1626   DEBUG(reportNumber("replace mmap stop: ", CountersEnd, 16));
1627   assert(CountersEnd > CountersStart, "no counters");
1628 
1629   const bool Shared = !__bolt_instr_use_pid;
1630   const uint64_t MapPrivateOrShared = Shared ? MAP_SHARED : MAP_PRIVATE;
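  // Remap the pages backing the counters in place. When a single shared profile
  // is used (no PID suffix), the mapping is shared so that the watcher process
  // forked below observes the parent's counter updates.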
1631 
1632   void *Ret =
1633       __mmap(CountersStart, CountersEnd - CountersStart, PROT_READ | PROT_WRITE,
1634              MAP_ANONYMOUS | MapPrivateOrShared | MAP_FIXED, -1, 0);
1635   assert(Ret != MAP_FAILED, "__bolt_instr_setup: Failed to mmap counters!");
1636 
1637   GlobalMetadataStorage = __mmap(0, 4096, PROT_READ | PROT_WRITE,
1638                                  MapPrivateOrShared | MAP_ANONYMOUS, -1, 0);
1639   assert(GlobalMetadataStorage != MAP_FAILED,
1640          "__bolt_instr_setup: failed to mmap page for metadata!");
1641 
1642   GlobalAlloc = new (GlobalMetadataStorage) BumpPtrAllocator;
1643   // Conservatively reserve 100MiB
1644   GlobalAlloc->setMaxSize(0x6400000);
1645   GlobalAlloc->setShared(Shared);
1646   GlobalWriteProfileMutex = new (*GlobalAlloc, 0) Mutex();
1647   if (__bolt_instr_num_ind_calls > 0)
1648     GlobalIndCallCounters =
1649         new (*GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls];
1650 
1651   if (__bolt_instr_sleep_time != 0) {
1652     // Move the instrumented process to its own process group
1653     if (__bolt_instr_wait_forks)
1654       __setpgid(0, 0);
1655 
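    // Fork the watcher: the parent returns to normal execution while the child
    // periodically dumps the profile in watchProcess().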
1656     if (long PID = __fork())
1657       return;
1658     watchProcess();
1659   }
1660 }
1661 
1662 extern "C" __attribute((force_align_arg_pointer)) void
1663 instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) {
1664   GlobalIndCallCounters[IndCallID].incrementVal(Target, *GlobalAlloc);
1665 }
1666 
1667 /// We receive the identifier of the indirect call site as well as the target
1668 /// address of the call as in-stack arguments.
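/// (The fixed rsp displacements in the inline asm skip over the register state
/// pushed by SAVE_ALL to reach those stack slots.)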
1669 extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
1670 {
1671   __asm__ __volatile__(SAVE_ALL
1672                        "mov 0xa0(%%rsp), %%rdi\n"
1673                        "mov 0x98(%%rsp), %%rsi\n"
1674                        "call instrumentIndirectCall\n"
1675                        RESTORE_ALL
1676                        "ret\n"
1677                        :::);
1678 }
1679 
1680 extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
1681 {
1682   __asm__ __volatile__(SAVE_ALL
1683                        "mov 0x98(%%rsp), %%rdi\n"
1684                        "mov 0x90(%%rsp), %%rsi\n"
1685                        "call instrumentIndirectCall\n"
1686                        RESTORE_ALL
1687                        "ret\n"
1688                        :::);
1689 }
1690 
1691 /// This hooks the ELF entry point; it needs to save all machine state.
1692 extern "C" __attribute((naked)) void __bolt_instr_start()
1693 {
1694   __asm__ __volatile__(SAVE_ALL
1695                        "call __bolt_instr_setup\n"
1696                        RESTORE_ALL
1697                        "jmp __bolt_start_trampoline\n"
1698                        :::);
1699 }
1700 
1701 /// This hooks into ELF's DT_FINI.
1702 extern "C" void __bolt_instr_fini() {
1703   __bolt_fini_trampoline();
1704   if (__bolt_instr_sleep_time == 0) {
1705     int FD = openProfile();
1706     __bolt_instr_data_dump(FD);
1707     __close(FD);
1708   }
1709   DEBUG(report("Finished.\n"));
1710 }
1711 
1712 #endif
1713 
1714 #if defined(__APPLE__)
1715 
1716 extern "C" void __bolt_instr_data_dump() {
1717   ProfileWriterContext Ctx = readDescriptions();
1718 
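  // Write the profile to stderr (fd 2).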
1719   int FD = 2;
1720   BumpPtrAllocator Alloc;
1721   const uint8_t *FuncDesc = Ctx.FuncDescriptions;
1722   uint32_t bolt_instr_num_funcs = _bolt_instr_num_funcs_getter();
1723 
1724   for (int I = 0, E = bolt_instr_num_funcs; I < E; ++I) {
1725     FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
1726     Alloc.clear();
1727     DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
1728   }
1729   assert(FuncDesc == (void *)Ctx.Strings,
1730          "FuncDesc ptr must be equal to stringtable");
1731 }
1732 
1733 // On OSX/iOS the final symbol name of an extern "C" function/variable contains
1734 // one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
1735 extern "C"
1736 __attribute__((section("__TEXT,__setup")))
1737 __attribute__((force_align_arg_pointer))
1738 void _bolt_instr_setup() {
1739   __asm__ __volatile__(SAVE_ALL :::);
1740 
1741   report("Hello!\n");
1742 
1743   __asm__ __volatile__(RESTORE_ALL :::);
1744 }
1745 
1746 extern "C"
1747 __attribute__((section("__TEXT,__fini")))
1748 __attribute__((force_align_arg_pointer))
1749 void _bolt_instr_fini() {
1750   report("Bye!\n");
1751   __bolt_instr_data_dump();
1752 }
1753 
1754 #endif
1755 #endif
1756