//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch.
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// Utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption).
// "Scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory-safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
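
// The mirror-image check, isScratch, comes from CtxInstrProfiling.h and is
// used further below. For illustration only (not the authoritative
// declarations), checking and clearing the taint bit is assumed to look
// roughly like the hypothetical helpers below:
//
//   bool looksScratch(const void *Ctx) {             // hypothetical helper
//     return reinterpret_cast<uint64_t>(Ctx) & 1;
//   }
//   ContextNode *clearScratchBit(ContextNode *Ctx) { // hypothetical helper
//     return reinterpret_cast<ContextNode *>(
//         reinterpret_cast<uint64_t>(Ctx) & ~uint64_t(1));
//   }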

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}

// We allocate Arena pages of at least kBuffSize bytes. The scratch buffer is
// also that large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// It's highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}

// Verify the structural integrity of the contexts reachable from Root.
bool validate(const ContextRoot *Root) {
  // All contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NumCounters,
                                     uint32_t NumCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next);
}
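
// allocContextNode placement-constructs a variable-sized ContextNode at Place.
// ContextNode and ContextNode::getAllocSize are defined in CtxInstrProfiling.h;
// the code in this file only relies on the node header being followed by
// trailing arrays - NumCounters 64-bit counters and NumCallsites subcontext
// pointers - so the per-node footprint is assumed to be, roughly (a sketch
// under that assumption, not the authoritative definition):
//
//   size_t exampleAllocSize(uint32_t NumCounters, uint32_t NumCallsites) {
//     return sizeof(ContextNode) + sizeof(uint64_t) * NumCounters +
//            sizeof(ContextNode *) * NumCallsites;
//   }
//
// This is also why validate() above can walk an arena by advancing
// Ctx->size() bytes from one context to the next.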

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }

} // namespace

// The scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// Initialize the TLS slots.
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this particular callee
// (identified by Guid), we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NumCounters, uint32_t NumCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // If we failed to allocate on the current arena, allocate a new arena and
    // install it as __llvm_ctx_profile_current_context_root->CurrentMem, so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NumCounters,
                                            uint32_t NumCallsites) {
  // Fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // Also fast "out" if the caller is scratch. We can tell by looking at the
  // interior pointer into the caller's subcontexts vector: if the caller
  // context is scratch, so is that interior pointer, because the taint bit of
  // the base address is preserved by all the address calculations (real
  // contexts, and interior pointers into them, are even - more precisely,
  // 8-aligned).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // If the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
  // Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent that it's not worth doing more for
  // that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // In the case of indirect calls, all the targets seen so far form a linked
  // list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NumCounters,
                                         NumCallsites);
  if (Ret->callsites_size() != NumCallsites ||
      Ret->counters_size() != NumCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NumCallsites,
                        NumCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}
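
// For illustration: compiler-emitted code in an instrumented *caller* is
// expected to populate the two TLS slots consumed above right before each
// instrumented callsite, then make the call; the callee's entry then invokes
// __llvm_ctx_profile_get_context. The hand-written sketch below is an
// assumption about that protocol, not code this runtime provides (`CallerCtx`,
// `CallsiteIdx` and `Target` are made-up names); index 1 of the TLS arrays is
// reserved for the signal-handler scenario described above:
//
//   __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(Target);
//   __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[CallsiteIdx];
//   Target(...); // the callee's prologue calls __llvm_ctx_profile_get_context
//                // with its own address, GUID, and counter/callsite counts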

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
                  uint32_t NumCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NumCounters, NumCallsites);
  AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return the scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}
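
// For illustration: an instrumented root function is expected to bracket its
// body with the two entry points above. The hand-written sketch below is an
// assumption about what the instrumentation lowering emits, not code this
// runtime provides (`MyRoot`, `MyRootGuid`, and the counter/callsite counts
// are made-up):
//
//   static ContextRoot MyRoot; // one per root function, zero-initialized
//
//   void myRootFunction() {
//     ContextNode *Ctx = __llvm_ctx_profile_start_context(
//         &MyRoot, MyRootGuid, /*Counters=*/4, /*Callsites=*/2);
//     // ... instrumented body: bump counters on Ctx, set up the callsite TLS
//     // slots before calls (see the sketch after
//     // __llvm_ctx_profile_get_context) ...
//     __llvm_ctx_profile_release_context(&MyRoot);
//   }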

void __llvm_ctx_profile_start_collection() {
  size_t NumMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NumMemUnits;

    resetContextNode(*Root->FirstNode);
  }
  __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
}

bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *W, const ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}

void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}
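
// For illustration: a tool embedding this runtime would typically drive a
// collection cycle as sketched below. The Writer callback receives each root's
// tree of ContextNodes; how it serializes them is up to the caller. This is a
// hypothetical, hand-written sketch (countRoots / exampleCollectionCycle are
// made-up names), not an API provided by this file:
//
//   static bool countRoots(void *Data, const ContextNode &Root) {
//     ++*reinterpret_cast<size_t *>(Data); // e.g. just count the roots
//     return true;                         // keep iterating over the roots
//   }
//
//   void exampleCollectionCycle() {
//     __llvm_ctx_profile_start_collection(); // zero out existing counters
//     // ... run the instrumented workload ...
//     size_t NumRoots = 0;
//     if (__llvm_ctx_profile_fetch(&NumRoots, countRoots))
//       __sanitizer::Printf("[ctxprof] NumRoots: %zu\n", NumRoots);
//     __llvm_ctx_profile_free(); // release all arenas when done
//   }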