1 //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements support for context disambiguation of allocation
10 // calls for profile guided heap optimization. Specifically, it uses Memprof
11 // profiles which indicate context specific allocation behavior (currently
12 // distinguishing cold vs hot memory allocations). Cloning is performed to
13 // expose the cold allocation call contexts, and the allocation calls are
14 // subsequently annotated with an attribute for later transformation.
15 //
16 // The transformations can be performed either directly on IR (regular LTO), or
17 // on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18 // Both types of LTO operate on the same base graph representation, which
19 // uses CRTP to support either IR or Index formats.
20 //
21 //===----------------------------------------------------------------------===//
22
23 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/ADT/DenseSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/SetOperations.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/MemoryProfileInfo.h"
33 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
34 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
35 #include "llvm/Bitcode/BitcodeReader.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/Module.h"
39 #include "llvm/IR/ModuleSummaryIndex.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/FileSystem.h"
43 #include "llvm/Support/GraphWriter.h"
44 #include "llvm/Support/raw_ostream.h"
45 #include "llvm/Transforms/IPO.h"
46 #include "llvm/Transforms/Utils/Cloning.h"
47 #include <sstream>
48 #include <vector>
49 using namespace llvm;
50 using namespace llvm::memprof;
51
52 #define DEBUG_TYPE "memprof-context-disambiguation"
53
54 STATISTIC(FunctionClonesAnalysis,
55 "Number of function clones created during whole program analysis");
56 STATISTIC(FunctionClonesThinBackend,
57 "Number of function clones created during ThinLTO backend");
58 STATISTIC(FunctionsClonedThinBackend,
59 "Number of functions that had clones created during ThinLTO backend");
60 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
61 "cloned) during whole program analysis");
62 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
63 "during whole program analysis");
64 STATISTIC(AllocTypeNotColdThinBackend,
65 "Number of not cold static allocations (possibly cloned) during "
66 "ThinLTO backend");
67 STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
68 "(possibly cloned) during ThinLTO backend");
69 STATISTIC(OrigAllocsThinBackend,
70 "Number of original (not cloned) allocations with memprof profiles "
71 "during ThinLTO backend");
72 STATISTIC(
73 AllocVersionsThinBackend,
74 "Number of allocation versions (including clones) during ThinLTO backend");
75 STATISTIC(MaxAllocVersionsThinBackend,
76 "Maximum number of allocation versions created for an original "
77 "allocation during ThinLTO backend");
78 STATISTIC(UnclonableAllocsThinBackend,
79 "Number of unclonable ambiguous allocations during ThinLTO backend");
80
81 static cl::opt<std::string> DotFilePathPrefix(
82 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
83 cl::value_desc("filename"),
84 cl::desc("Specify the path prefix of the MemProf dot files."));
85
86 static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
87 cl::Hidden,
88 cl::desc("Export graph to dot files."));
89
90 static cl::opt<bool>
91 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
92 cl::desc("Dump CallingContextGraph to stdout after each stage."));
93
94 static cl::opt<bool>
95 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
96 cl::desc("Perform verification checks on CallingContextGraph."));
97
98 static cl::opt<bool>
99 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
100 cl::desc("Perform frequent verification checks on nodes."));
101
102 static cl::opt<std::string> MemProfImportSummary(
103 "memprof-import-summary",
104 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
105 cl::Hidden);
106
107 namespace llvm {
108 // Indicate we are linking with an allocator that supports hot/cold operator
109 // new interfaces.
110 cl::opt<bool> SupportsHotColdNew(
111 "supports-hot-cold-new", cl::init(false), cl::Hidden,
112 cl::desc("Linking with hot/cold operator new interfaces"));
113 } // namespace llvm
114
115 namespace {
116 /// CRTP base for graphs built from either IR or ThinLTO summary index.
117 ///
118 /// The graph represents the call contexts in all memprof metadata on allocation
119 /// calls, with nodes for the allocations themselves, as well as for the calls
120 /// in each context. The graph is initially built from the allocation memprof
121 /// metadata (or summary) MIBs. It is then updated to match calls with callsite
122 /// metadata onto the nodes, updating it to reflect any inlining performed on
123 /// those calls.
124 ///
125 /// Each MIB (representing an allocation's call context with allocation
126 /// behavior) is assigned a unique context id during the graph build. The edges
127 /// and nodes in the graph are decorated with the context ids they carry. This
128 /// is used to correctly update the graph when cloning is performed so that we
129 /// can uniquify the context for a single (possibly cloned) allocation.
130 template <typename DerivedCCG, typename FuncTy, typename CallTy>
131 class CallsiteContextGraph {
132 public:
133 CallsiteContextGraph() = default;
134 CallsiteContextGraph(const CallsiteContextGraph &) = default;
135 CallsiteContextGraph(CallsiteContextGraph &&) = default;
136
137 /// Main entry point to perform analysis and transformations on graph.
138 bool process();
139
140 /// Perform cloning on the graph necessary to uniquely identify the allocation
141 /// behavior of an allocation based on its context.
142 void identifyClones();
143
144 /// Assign callsite clones to functions, cloning functions as needed to
145 /// accommodate the combinations of their callsite clones reached by callers.
146 /// For regular LTO this clones functions and callsites in the IR, but for
147 /// ThinLTO the cloning decisions are noted in the summaries and later applied
148 /// in applyImport.
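/// For example (illustrative): if a function ends up containing callsite
/// clones that are reached from disjoint sets of callers, the function itself
/// is cloned so that each caller can target a version holding only the
/// callsite clone(s) it actually reaches.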
149 bool assignFunctions(); 150 151 void dump() const; 152 void print(raw_ostream &OS) const; 153 154 friend raw_ostream &operator<<(raw_ostream &OS, 155 const CallsiteContextGraph &CCG) { 156 CCG.print(OS); 157 return OS; 158 } 159 160 friend struct GraphTraits< 161 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; 162 friend struct DOTGraphTraits< 163 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; 164 165 void exportToDot(std::string Label) const; 166 167 /// Represents a function clone via FuncTy pointer and clone number pair. 168 struct FuncInfo final 169 : public std::pair<FuncTy *, unsigned /*Clone number*/> { 170 using Base = std::pair<FuncTy *, unsigned>; 171 FuncInfo(const Base &B) : Base(B) {} 172 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} 173 explicit operator bool() const { return this->first != nullptr; } 174 FuncTy *func() const { return this->first; } 175 unsigned cloneNo() const { return this->second; } 176 }; 177 178 /// Represents a callsite clone via CallTy and clone number pair. 179 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> { 180 using Base = std::pair<CallTy, unsigned>; 181 CallInfo(const Base &B) : Base(B) {} 182 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) 183 : Base(Call, CloneNo) {} 184 explicit operator bool() const { return (bool)this->first; } 185 CallTy call() const { return this->first; } 186 unsigned cloneNo() const { return this->second; } 187 void setCloneNo(unsigned N) { this->second = N; } 188 void print(raw_ostream &OS) const { 189 if (!operator bool()) { 190 assert(!cloneNo()); 191 OS << "null Call"; 192 return; 193 } 194 call()->print(OS); 195 OS << "\t(clone " << cloneNo() << ")"; 196 } 197 void dump() const { 198 print(dbgs()); 199 dbgs() << "\n"; 200 } 201 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { 202 Call.print(OS); 203 return OS; 204 } 205 }; 206 207 struct ContextEdge; 208 209 /// Node in the Callsite Context Graph 210 struct ContextNode { 211 // Keep this for now since in the IR case where we have an Instruction* it 212 // is not as immediately discoverable. Used for printing richer information 213 // when dumping graph. 214 bool IsAllocation; 215 216 // Keeps track of when the Call was reset to null because there was 217 // recursion. 218 bool Recursive = false; 219 220 // The corresponding allocation or interior call. 221 CallInfo Call; 222 223 // For alloc nodes this is a unique id assigned when constructed, and for 224 // callsite stack nodes it is the original stack id when the node is 225 // constructed from the memprof MIB metadata on the alloc nodes. Note that 226 // this is only used when matching callsite metadata onto the stack nodes 227 // created when processing the allocation memprof MIBs, and for labeling 228 // nodes in the dot graph. Therefore we don't bother to assign a value for 229 // clones. 230 uint64_t OrigStackOrAllocId = 0; 231 232 // This will be formed by ORing together the AllocationType enum values 233 // for contexts including this node. 234 uint8_t AllocTypes = 0; 235 236 // Edges to all callees in the profiled call stacks. 237 // TODO: Should this be a map (from Callee node) for more efficient lookup? 238 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges; 239 240 // Edges to all callers in the profiled call stacks. 241 // TODO: Should this be a map (from Caller node) for more efficient lookup? 
242 std::vector<std::shared_ptr<ContextEdge>> CallerEdges; 243 244 // The set of IDs for contexts including this node. 245 DenseSet<uint32_t> ContextIds; 246 247 // List of clones of this ContextNode, initially empty. 248 std::vector<ContextNode *> Clones; 249 250 // If a clone, points to the original uncloned node. 251 ContextNode *CloneOf = nullptr; 252 253 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} 254 255 ContextNode(bool IsAllocation, CallInfo C) 256 : IsAllocation(IsAllocation), Call(C) {} 257 258 void addClone(ContextNode *Clone) { 259 if (CloneOf) { 260 CloneOf->Clones.push_back(Clone); 261 Clone->CloneOf = CloneOf; 262 } else { 263 Clones.push_back(Clone); 264 assert(!Clone->CloneOf); 265 Clone->CloneOf = this; 266 } 267 } 268 269 ContextNode *getOrigNode() { 270 if (!CloneOf) 271 return this; 272 return CloneOf; 273 } 274 275 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, 276 unsigned int ContextId); 277 278 ContextEdge *findEdgeFromCallee(const ContextNode *Callee); 279 ContextEdge *findEdgeFromCaller(const ContextNode *Caller); 280 void eraseCalleeEdge(const ContextEdge *Edge); 281 void eraseCallerEdge(const ContextEdge *Edge); 282 283 void setCall(CallInfo C) { Call = C; } 284 285 bool hasCall() const { return (bool)Call.call(); } 286 287 void printCall(raw_ostream &OS) const { Call.print(OS); } 288 289 // True if this node was effectively removed from the graph, in which case 290 // its context id set, caller edges, and callee edges should all be empty. 291 bool isRemoved() const { 292 assert(ContextIds.empty() == 293 (CalleeEdges.empty() && CallerEdges.empty())); 294 return ContextIds.empty(); 295 } 296 297 void dump() const; 298 void print(raw_ostream &OS) const; 299 300 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { 301 Node.print(OS); 302 return OS; 303 } 304 }; 305 306 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or 307 /// callee. 308 struct ContextEdge { 309 ContextNode *Callee; 310 ContextNode *Caller; 311 312 // This will be formed by ORing together the AllocationType enum values 313 // for contexts including this edge. 314 uint8_t AllocTypes = 0; 315 316 // The set of IDs for contexts including this edge. 317 DenseSet<uint32_t> ContextIds; 318 319 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, 320 DenseSet<uint32_t> ContextIds) 321 : Callee(Callee), Caller(Caller), AllocTypes(AllocType), 322 ContextIds(ContextIds) {} 323 324 DenseSet<uint32_t> &getContextIds() { return ContextIds; } 325 326 void dump() const; 327 void print(raw_ostream &OS) const; 328 329 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { 330 Edge.print(OS); 331 return OS; 332 } 333 }; 334 335 /// Helper to remove callee edges that have allocation type None (due to not 336 /// carrying any context ids) after transformations. 337 void removeNoneTypeCalleeEdges(ContextNode *Node); 338 339 protected: 340 /// Get a list of nodes corresponding to the stack ids in the given callsite 341 /// context. 342 template <class NodeT, class IteratorT> 343 std::vector<uint64_t> 344 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext); 345 346 /// Adds nodes for the given allocation and any stack ids on its memprof MIB 347 /// metadata (or summary). 348 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F); 349 350 /// Adds nodes for the given MIB stack ids. 
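/// For example (illustrative): an MIB whose context is the stack
/// <alloc, A, B, C> (allocation frame first) gets a fresh context id that is
/// recorded on stack nodes A, B and C and on the caller edges built between
/// them (C calls B, B calls A, A calls the allocation), along with the MIB's
/// allocation type.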
351 template <class NodeT, class IteratorT> 352 void addStackNodesForMIB(ContextNode *AllocNode, 353 CallStack<NodeT, IteratorT> &StackContext, 354 CallStack<NodeT, IteratorT> &CallsiteContext, 355 AllocationType AllocType); 356 357 /// Matches all callsite metadata (or summary) to the nodes created for 358 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any 359 /// inlining performed on those callsite instructions. 360 void updateStackNodes(); 361 362 /// Update graph to conservatively handle any callsite stack nodes that target 363 /// multiple different callee target functions. 364 void handleCallsitesWithMultipleTargets(); 365 366 /// Save lists of calls with MemProf metadata in each function, for faster 367 /// iteration. 368 std::vector<std::pair<FuncTy *, std::vector<CallInfo>>> 369 FuncToCallsWithMetadata; 370 371 /// Map from callsite node to the enclosing caller function. 372 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc; 373 374 private: 375 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator; 376 377 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>, 378 const FuncTy *, DenseSet<uint32_t>>; 379 380 /// Assigns the given Node to calls at or inlined into the location with 381 /// the Node's stack id, after post order traversing and processing its 382 /// caller nodes. Uses the call information recorded in the given 383 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences 384 /// as needed. Called by updateStackNodes which sets up the given 385 /// StackIdToMatchingCalls map. 386 void assignStackNodesPostOrder( 387 ContextNode *Node, DenseSet<const ContextNode *> &Visited, 388 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls); 389 390 /// Duplicates the given set of context ids, updating the provided 391 /// map from each original id with the newly generated context ids, 392 /// and returning the new duplicated id set. 393 DenseSet<uint32_t> duplicateContextIds( 394 const DenseSet<uint32_t> &StackSequenceContextIds, 395 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); 396 397 /// Propagates all duplicated context ids across the graph. 398 void propagateDuplicateContextIds( 399 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); 400 401 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true, 402 /// else to its callers. Also updates OrigNode's edges to remove any context 403 /// ids moved to the newly created edge. 404 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, 405 bool TowardsCallee); 406 407 /// Get the stack id corresponding to the given Id or Index (for IR this will 408 /// return itself, for a summary index this will return the id recorded in the 409 /// index for that stack id index value). 410 uint64_t getStackId(uint64_t IdOrIndex) const { 411 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex); 412 } 413 414 /// Returns true if the given call targets the given function. 415 bool calleeMatchesFunc(CallTy Call, const FuncTy *Func) { 416 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(Call, Func); 417 } 418 419 /// Get a list of nodes corresponding to the stack ids in the given 420 /// callsite's context. 421 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) { 422 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall( 423 Call); 424 } 425 426 /// Get the last stack id in the context for callsite. 
427 uint64_t getLastStackId(CallTy Call) {
428 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
429 }
430
431 /// Update the allocation call to record type of allocated memory.
432 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
433 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
434 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
435 }
436
437 /// Update non-allocation call to invoke (possibly cloned) function
438 /// CalleeFunc.
439 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
440 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
441 }
442
443 /// Clone the given function for the given callsite, recording mapping of all
444 /// of the function's tracked calls to their new versions in the CallMap.
445 /// Assigns new clones to clone number CloneNo.
446 FuncInfo cloneFunctionForCallsite(
447 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
448 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
449 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
450 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
451 }
452
453 /// Gets a label to use in the dot graph for the given call clone in the given
454 /// function.
455 std::string getLabel(const FuncTy *Func, const CallTy Call,
456 unsigned CloneNo) const {
457 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
458 }
459
460 /// Helpers to find the node corresponding to the given call or stackid.
461 ContextNode *getNodeForInst(const CallInfo &C);
462 ContextNode *getNodeForAlloc(const CallInfo &C);
463 ContextNode *getNodeForStackId(uint64_t StackId);
464
465 /// Removes the node information recorded for the given call.
466 void unsetNodeForInst(const CallInfo &C);
467
468 /// Computes the alloc type corresponding to the given context ids, by
469 /// unioning their recorded alloc types.
470 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
471
472 /// Returns the allocation type of the intersection of the contexts of two
473 /// nodes (based on their provided context id sets), optimized for the case
474 /// when Node1Ids is smaller than Node2Ids.
475 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
476 const DenseSet<uint32_t> &Node2Ids);
477
478 /// Returns the allocation type of the intersection of the contexts of two
479 /// nodes (based on their provided context id sets).
480 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
481 const DenseSet<uint32_t> &Node2Ids);
482
483 /// Create a clone of Edge's callee and move Edge to that new callee node,
484 /// performing the necessary context id and allocation type updates.
485 /// If callee's caller edge iterator is supplied, it is updated when removing
486 /// the edge from that list.
487 ContextNode *
488 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
489 EdgeIter *CallerEdgeI = nullptr);
490
491 /// Change the callee of Edge to existing callee clone NewCallee, performing
492 /// the necessary context id and allocation type updates.
493 /// If callee's caller edge iterator is supplied, it is updated when removing
494 /// the edge from that list.
495 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, 496 ContextNode *NewCallee, 497 EdgeIter *CallerEdgeI = nullptr, 498 bool NewClone = false); 499 500 /// Recursively perform cloning on the graph for the given Node and its 501 /// callers, in order to uniquely identify the allocation behavior of an 502 /// allocation given its context. 503 void identifyClones(ContextNode *Node, 504 DenseSet<const ContextNode *> &Visited); 505 506 /// Map from each context ID to the AllocationType assigned to that context. 507 std::map<uint32_t, AllocationType> ContextIdToAllocationType; 508 509 /// Identifies the context node created for a stack id when adding the MIB 510 /// contexts to the graph. This is used to locate the context nodes when 511 /// trying to assign the corresponding callsites with those stack ids to these 512 /// nodes. 513 std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap; 514 515 /// Maps to track the calls to their corresponding nodes in the graph. 516 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap; 517 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap; 518 519 /// Owner of all ContextNode unique_ptrs. 520 std::vector<std::unique_ptr<ContextNode>> NodeOwner; 521 522 /// Perform sanity checks on graph when requested. 523 void check() const; 524 525 /// Keeps track of the last unique context id assigned. 526 unsigned int LastContextId = 0; 527 }; 528 529 template <typename DerivedCCG, typename FuncTy, typename CallTy> 530 using ContextNode = 531 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode; 532 template <typename DerivedCCG, typename FuncTy, typename CallTy> 533 using ContextEdge = 534 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge; 535 template <typename DerivedCCG, typename FuncTy, typename CallTy> 536 using FuncInfo = 537 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo; 538 template <typename DerivedCCG, typename FuncTy, typename CallTy> 539 using CallInfo = 540 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo; 541 542 /// CRTP derived class for graphs built from IR (regular LTO). 
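/// Functions are represented by Function* and calls by Instruction*; the
/// context information is read from !memprof and !callsite metadata in the IR.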
543 class ModuleCallsiteContextGraph 544 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 545 Instruction *> { 546 public: 547 ModuleCallsiteContextGraph( 548 Module &M, 549 function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter); 550 551 private: 552 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 553 Instruction *>; 554 555 uint64_t getStackId(uint64_t IdOrIndex) const; 556 bool calleeMatchesFunc(Instruction *Call, const Function *Func); 557 uint64_t getLastStackId(Instruction *Call); 558 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call); 559 void updateAllocationCall(CallInfo &Call, AllocationType AllocType); 560 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); 561 CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 562 Instruction *>::FuncInfo 563 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, 564 std::map<CallInfo, CallInfo> &CallMap, 565 std::vector<CallInfo> &CallsWithMetadataInFunc, 566 unsigned CloneNo); 567 std::string getLabel(const Function *Func, const Instruction *Call, 568 unsigned CloneNo) const; 569 570 const Module &Mod; 571 function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; 572 }; 573 574 /// Represents a call in the summary index graph, which can either be an 575 /// allocation or an interior callsite node in an allocation's context. 576 /// Holds a pointer to the corresponding data structure in the index. 577 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> { 578 IndexCall() : PointerUnion() {} 579 IndexCall(std::nullptr_t) : IndexCall() {} 580 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} 581 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} 582 IndexCall(PointerUnion PT) : PointerUnion(PT) {} 583 584 IndexCall *operator->() { return this; } 585 586 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; } 587 588 void print(raw_ostream &OS) const { 589 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) { 590 OS << *AI; 591 } else { 592 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase()); 593 assert(CI); 594 OS << *CI; 595 } 596 } 597 }; 598 599 /// CRTP derived class for graphs built from summary index (ThinLTO). 
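/// Functions are represented by FunctionSummary* and calls by IndexCall, a
/// PointerUnion over the summary CallsiteInfo/AllocInfo records; the context
/// information is read from the allocation and callsite records in the index.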
600 class IndexCallsiteContextGraph 601 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 602 IndexCall> { 603 public: 604 IndexCallsiteContextGraph( 605 ModuleSummaryIndex &Index, 606 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 607 isPrevailing); 608 609 private: 610 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 611 IndexCall>; 612 613 uint64_t getStackId(uint64_t IdOrIndex) const; 614 bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func); 615 uint64_t getLastStackId(IndexCall &Call); 616 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call); 617 void updateAllocationCall(CallInfo &Call, AllocationType AllocType); 618 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); 619 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 620 IndexCall>::FuncInfo 621 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, 622 std::map<CallInfo, CallInfo> &CallMap, 623 std::vector<CallInfo> &CallsWithMetadataInFunc, 624 unsigned CloneNo); 625 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, 626 unsigned CloneNo) const; 627 628 // Saves mapping from function summaries containing memprof records back to 629 // its VI, for use in checking and debugging. 630 std::map<const FunctionSummary *, ValueInfo> FSToVIMap; 631 632 const ModuleSummaryIndex &Index; 633 }; 634 } // namespace 635 636 namespace llvm { 637 template <> 638 struct DenseMapInfo<typename CallsiteContextGraph< 639 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> 640 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; 641 template <> 642 struct DenseMapInfo<typename CallsiteContextGraph< 643 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> 644 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; 645 template <> 646 struct DenseMapInfo<IndexCall> 647 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {}; 648 } // end namespace llvm 649 650 namespace { 651 652 struct FieldSeparator { 653 bool Skip = true; 654 const char *Sep; 655 656 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {} 657 }; 658 659 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { 660 if (FS.Skip) { 661 FS.Skip = false; 662 return OS; 663 } 664 return OS << FS.Sep; 665 } 666 667 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc 668 // type we should actually use on the corresponding allocation. 669 // If we can't clone a node that has NotCold+Cold alloc type, we will fall 670 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold 671 // from NotCold. 672 AllocationType allocTypeToUse(uint8_t AllocTypes) { 673 assert(AllocTypes != (uint8_t)AllocationType::None); 674 if (AllocTypes == 675 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) 676 return AllocationType::NotCold; 677 else 678 return (AllocationType)AllocTypes; 679 } 680 681 // Helper to check if the alloc types for all edges recorded in the 682 // InAllocTypes vector match the alloc types for all edges in the Edges 683 // vector. 
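// For example, an InAllocTypes entry of NotCold|Cold matches an edge whose
// AllocTypes is NotCold, since allocTypeToUse() falls back to NotCold for the
// ambiguous NotCold|Cold case, and a None value on either side matches
// anything (no context ids exist along that edge for those entries).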
684 template <typename DerivedCCG, typename FuncTy, typename CallTy> 685 bool allocTypesMatch( 686 const std::vector<uint8_t> &InAllocTypes, 687 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>> 688 &Edges) { 689 return std::equal( 690 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), 691 [](const uint8_t &l, 692 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) { 693 // Can share if one of the edges is None type - don't 694 // care about the type along that edge as it doesn't 695 // exist for those context ids. 696 if (l == (uint8_t)AllocationType::None || 697 r->AllocTypes == (uint8_t)AllocationType::None) 698 return true; 699 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes); 700 }); 701 } 702 703 } // end anonymous namespace 704 705 template <typename DerivedCCG, typename FuncTy, typename CallTy> 706 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 707 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst( 708 const CallInfo &C) { 709 ContextNode *Node = getNodeForAlloc(C); 710 if (Node) 711 return Node; 712 713 return NonAllocationCallToContextNodeMap.lookup(C); 714 } 715 716 template <typename DerivedCCG, typename FuncTy, typename CallTy> 717 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 718 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc( 719 const CallInfo &C) { 720 return AllocationCallToContextNodeMap.lookup(C); 721 } 722 723 template <typename DerivedCCG, typename FuncTy, typename CallTy> 724 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 725 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId( 726 uint64_t StackId) { 727 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); 728 if (StackEntryNode != StackEntryIdToContextNodeMap.end()) 729 return StackEntryNode->second; 730 return nullptr; 731 } 732 733 template <typename DerivedCCG, typename FuncTy, typename CallTy> 734 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst( 735 const CallInfo &C) { 736 AllocationCallToContextNodeMap.erase(C) || 737 NonAllocationCallToContextNodeMap.erase(C); 738 assert(!AllocationCallToContextNodeMap.count(C) && 739 !NonAllocationCallToContextNodeMap.count(C)); 740 } 741 742 template <typename DerivedCCG, typename FuncTy, typename CallTy> 743 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 744 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, 745 unsigned int ContextId) { 746 for (auto &Edge : CallerEdges) { 747 if (Edge->Caller == Caller) { 748 Edge->AllocTypes |= (uint8_t)AllocType; 749 Edge->getContextIds().insert(ContextId); 750 return; 751 } 752 } 753 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>( 754 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId})); 755 CallerEdges.push_back(Edge); 756 Caller->CalleeEdges.push_back(Edge); 757 } 758 759 template <typename DerivedCCG, typename FuncTy, typename CallTy> 760 void CallsiteContextGraph< 761 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) { 762 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { 763 auto Edge = *EI; 764 if (Edge->AllocTypes == (uint8_t)AllocationType::None) { 765 assert(Edge->ContextIds.empty()); 766 Edge->Callee->eraseCallerEdge(Edge.get()); 767 EI = Node->CalleeEdges.erase(EI); 768 } else 769 ++EI; 770 } 771 } 772 773 template <typename DerivedCCG, typename FuncTy, typename CallTy> 774 typename 
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * 775 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 776 findEdgeFromCallee(const ContextNode *Callee) { 777 for (const auto &Edge : CalleeEdges) 778 if (Edge->Callee == Callee) 779 return Edge.get(); 780 return nullptr; 781 } 782 783 template <typename DerivedCCG, typename FuncTy, typename CallTy> 784 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * 785 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 786 findEdgeFromCaller(const ContextNode *Caller) { 787 for (const auto &Edge : CallerEdges) 788 if (Edge->Caller == Caller) 789 return Edge.get(); 790 return nullptr; 791 } 792 793 template <typename DerivedCCG, typename FuncTy, typename CallTy> 794 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 795 eraseCalleeEdge(const ContextEdge *Edge) { 796 auto EI = llvm::find_if( 797 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) { 798 return CalleeEdge.get() == Edge; 799 }); 800 assert(EI != CalleeEdges.end()); 801 CalleeEdges.erase(EI); 802 } 803 804 template <typename DerivedCCG, typename FuncTy, typename CallTy> 805 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 806 eraseCallerEdge(const ContextEdge *Edge) { 807 auto EI = llvm::find_if( 808 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) { 809 return CallerEdge.get() == Edge; 810 }); 811 assert(EI != CallerEdges.end()); 812 CallerEdges.erase(EI); 813 } 814 815 template <typename DerivedCCG, typename FuncTy, typename CallTy> 816 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType( 817 DenseSet<uint32_t> &ContextIds) { 818 uint8_t BothTypes = 819 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; 820 uint8_t AllocType = (uint8_t)AllocationType::None; 821 for (auto Id : ContextIds) { 822 AllocType |= (uint8_t)ContextIdToAllocationType[Id]; 823 // Bail early if alloc type reached both, no further refinement. 824 if (AllocType == BothTypes) 825 return AllocType; 826 } 827 return AllocType; 828 } 829 830 template <typename DerivedCCG, typename FuncTy, typename CallTy> 831 uint8_t 832 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl( 833 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) { 834 uint8_t BothTypes = 835 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; 836 uint8_t AllocType = (uint8_t)AllocationType::None; 837 for (auto Id : Node1Ids) { 838 if (!Node2Ids.count(Id)) 839 continue; 840 AllocType |= (uint8_t)ContextIdToAllocationType[Id]; 841 // Bail early if alloc type reached both, no further refinement. 
842 if (AllocType == BothTypes) 843 return AllocType; 844 } 845 return AllocType; 846 } 847 848 template <typename DerivedCCG, typename FuncTy, typename CallTy> 849 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes( 850 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) { 851 if (Node1Ids.size() < Node2Ids.size()) 852 return intersectAllocTypesImpl(Node1Ids, Node2Ids); 853 else 854 return intersectAllocTypesImpl(Node2Ids, Node1Ids); 855 } 856 857 template <typename DerivedCCG, typename FuncTy, typename CallTy> 858 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 859 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode( 860 CallInfo Call, const FuncTy *F) { 861 assert(!getNodeForAlloc(Call)); 862 NodeOwner.push_back( 863 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call)); 864 ContextNode *AllocNode = NodeOwner.back().get(); 865 AllocationCallToContextNodeMap[Call] = AllocNode; 866 NodeToCallingFunc[AllocNode] = F; 867 // Use LastContextId as a uniq id for MIB allocation nodes. 868 AllocNode->OrigStackOrAllocId = LastContextId; 869 // Alloc type should be updated as we add in the MIBs. We should assert 870 // afterwards that it is not still None. 871 AllocNode->AllocTypes = (uint8_t)AllocationType::None; 872 873 return AllocNode; 874 } 875 876 template <typename DerivedCCG, typename FuncTy, typename CallTy> 877 template <class NodeT, class IteratorT> 878 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( 879 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, 880 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) { 881 // Treating the hot alloc type as NotCold before the disambiguation for "hot" 882 // is done. 883 if (AllocType == AllocationType::Hot) 884 AllocType = AllocationType::NotCold; 885 886 ContextIdToAllocationType[++LastContextId] = AllocType; 887 888 // Update alloc type and context ids for this MIB. 889 AllocNode->AllocTypes |= (uint8_t)AllocType; 890 AllocNode->ContextIds.insert(LastContextId); 891 892 // Now add or update nodes for each stack id in alloc's context. 893 // Later when processing the stack ids on non-alloc callsites we will adjust 894 // for any inlining in the context. 895 ContextNode *PrevNode = AllocNode; 896 // Look for recursion (direct recursion should have been collapsed by 897 // module summary analysis, here we should just be detecting mutual 898 // recursion). Mark these nodes so we don't try to clone. 899 SmallSet<uint64_t, 8> StackIdSet; 900 // Skip any on the allocation call (inlining). 
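// (beginAfterSharedPrefix() below starts the walk past the frames that the
// MIB's stack context shares with the allocation's own callsite metadata.)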
901 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); 902 ContextIter != StackContext.end(); ++ContextIter) { 903 auto StackId = getStackId(*ContextIter); 904 ContextNode *StackNode = getNodeForStackId(StackId); 905 if (!StackNode) { 906 NodeOwner.push_back( 907 std::make_unique<ContextNode>(/*IsAllocation=*/false)); 908 StackNode = NodeOwner.back().get(); 909 StackEntryIdToContextNodeMap[StackId] = StackNode; 910 StackNode->OrigStackOrAllocId = StackId; 911 } 912 auto Ins = StackIdSet.insert(StackId); 913 if (!Ins.second) 914 StackNode->Recursive = true; 915 StackNode->ContextIds.insert(LastContextId); 916 StackNode->AllocTypes |= (uint8_t)AllocType; 917 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); 918 PrevNode = StackNode; 919 } 920 } 921 922 template <typename DerivedCCG, typename FuncTy, typename CallTy> 923 DenseSet<uint32_t> 924 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds( 925 const DenseSet<uint32_t> &StackSequenceContextIds, 926 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { 927 DenseSet<uint32_t> NewContextIds; 928 for (auto OldId : StackSequenceContextIds) { 929 NewContextIds.insert(++LastContextId); 930 OldToNewContextIds[OldId].insert(LastContextId); 931 assert(ContextIdToAllocationType.count(OldId)); 932 // The new context has the same allocation type as original. 933 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId]; 934 } 935 return NewContextIds; 936 } 937 938 template <typename DerivedCCG, typename FuncTy, typename CallTy> 939 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 940 propagateDuplicateContextIds( 941 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { 942 // Build a set of duplicated context ids corresponding to the input id set. 943 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) { 944 DenseSet<uint32_t> NewIds; 945 for (auto Id : ContextIds) 946 if (auto NewId = OldToNewContextIds.find(Id); 947 NewId != OldToNewContextIds.end()) 948 NewIds.insert(NewId->second.begin(), NewId->second.end()); 949 return NewIds; 950 }; 951 952 // Recursively update context ids sets along caller edges. 953 auto UpdateCallers = [&](ContextNode *Node, 954 DenseSet<const ContextEdge *> &Visited, 955 auto &&UpdateCallers) -> void { 956 for (const auto &Edge : Node->CallerEdges) { 957 auto Inserted = Visited.insert(Edge.get()); 958 if (!Inserted.second) 959 continue; 960 ContextNode *NextNode = Edge->Caller; 961 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds()); 962 // Only need to recursively iterate to NextNode via this caller edge if 963 // it resulted in any added ids to NextNode. 964 if (!NewIdsToAdd.empty()) { 965 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); 966 NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); 967 UpdateCallers(NextNode, Visited, UpdateCallers); 968 } 969 } 970 }; 971 972 DenseSet<const ContextEdge *> Visited; 973 for (auto &Entry : AllocationCallToContextNodeMap) { 974 auto *Node = Entry.second; 975 // Update ids on the allocation nodes before calling the recursive 976 // update along caller edges, since this simplifies the logic during 977 // that traversal. 
978 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds); 979 Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); 980 UpdateCallers(Node, Visited, UpdateCallers); 981 } 982 } 983 984 template <typename DerivedCCG, typename FuncTy, typename CallTy> 985 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( 986 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) { 987 // Make a copy of the context ids, since this will be adjusted below as they 988 // are moved. 989 DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds; 990 auto &OrigEdges = 991 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; 992 // Increment iterator in loop so that we can remove edges as needed. 993 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { 994 auto Edge = *EI; 995 // Remove any matching context ids from Edge, return set that were found and 996 // removed, these are the new edge's context ids. Also update the remaining 997 // (not found ids). 998 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds; 999 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, 1000 NotFoundContextIds); 1001 RemainingContextIds.swap(NotFoundContextIds); 1002 // If no matching context ids for this edge, skip it. 1003 if (NewEdgeContextIds.empty()) { 1004 ++EI; 1005 continue; 1006 } 1007 if (TowardsCallee) { 1008 auto NewEdge = std::make_shared<ContextEdge>( 1009 Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds), 1010 NewEdgeContextIds); 1011 NewNode->CalleeEdges.push_back(NewEdge); 1012 NewEdge->Callee->CallerEdges.push_back(NewEdge); 1013 } else { 1014 auto NewEdge = std::make_shared<ContextEdge>( 1015 NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds), 1016 NewEdgeContextIds); 1017 NewNode->CallerEdges.push_back(NewEdge); 1018 NewEdge->Caller->CalleeEdges.push_back(NewEdge); 1019 } 1020 // Remove old edge if context ids empty. 1021 if (Edge->getContextIds().empty()) { 1022 if (TowardsCallee) { 1023 Edge->Callee->eraseCallerEdge(Edge.get()); 1024 EI = OrigNode->CalleeEdges.erase(EI); 1025 } else { 1026 Edge->Caller->eraseCalleeEdge(Edge.get()); 1027 EI = OrigNode->CallerEdges.erase(EI); 1028 } 1029 continue; 1030 } 1031 ++EI; 1032 } 1033 } 1034 1035 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1036 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 1037 assignStackNodesPostOrder(ContextNode *Node, 1038 DenseSet<const ContextNode *> &Visited, 1039 DenseMap<uint64_t, std::vector<CallContextInfo>> 1040 &StackIdToMatchingCalls) { 1041 auto Inserted = Visited.insert(Node); 1042 if (!Inserted.second) 1043 return; 1044 // Post order traversal. Iterate over a copy since we may add nodes and 1045 // therefore new callers during the recursive call, invalidating any 1046 // iterator over the original edge vector. We don't need to process these 1047 // new nodes as they were already processed on creation. 1048 auto CallerEdges = Node->CallerEdges; 1049 for (auto &Edge : CallerEdges) { 1050 // Skip any that have been removed during the recursion. 1051 if (!Edge) 1052 continue; 1053 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls); 1054 } 1055 1056 // If this node's stack id is in the map, update the graph to contain new 1057 // nodes representing any inlining at interior callsites. Note we move the 1058 // associated context ids over to the new nodes. 1059 1060 // Ignore this node if it is for an allocation or we didn't record any 1061 // stack id lists ending at it. 
1062 if (Node->IsAllocation ||
1063 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1064 return;
1065
1066 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1067 // Handle the simple case first. A single call with a single stack id.
1068 // In this case there is no need to create any new context nodes, simply
1069 // assign the context node for stack id to this Call.
1070 if (Calls.size() == 1) {
1071 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1072 if (Ids.size() == 1) {
1073 assert(SavedContextIds.empty());
1074 // It should be this Node
1075 assert(Node == getNodeForStackId(Ids[0]));
1076 if (Node->Recursive)
1077 return;
1078 Node->setCall(Call);
1079 NonAllocationCallToContextNodeMap[Call] = Node;
1080 NodeToCallingFunc[Node] = Func;
1081 return;
1082 }
1083 }
1084
1085 // Find the node for the last stack id, which should be the same
1086 // across all calls recorded for this id, and is this node's id.
1087 uint64_t LastId = Node->OrigStackOrAllocId;
1088 ContextNode *LastNode = getNodeForStackId(LastId);
1089 // We should only have kept stack ids that had nodes.
1090 assert(LastNode);
1091
1092 for (unsigned I = 0; I < Calls.size(); I++) {
1093 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1094 // Skip any for which we didn't assign any ids, these don't get a node in
1095 // the graph.
1096 if (SavedContextIds.empty())
1097 continue;
1098
1099 assert(LastId == Ids.back());
1100
1101 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1102 assert(FirstNode);
1103
1104 // Recompute the context ids for this stack id sequence (the
1105 // intersection of the context ids of the corresponding nodes).
1106 // Start with the ids we saved in the map for this call, which could be
1107 // duplicated context ids. We have to recompute as we might have overlap
1108 // between the saved context ids for different last nodes, and may have
1109 // already removed some of them during the post order traversal.
1110 set_intersect(SavedContextIds, FirstNode->ContextIds);
1111 ContextNode *PrevNode = nullptr;
1112 for (auto Id : Ids) {
1113 ContextNode *CurNode = getNodeForStackId(Id);
1114 // We should only have kept stack ids that had nodes and weren't
1115 // recursive.
1116 assert(CurNode);
1117 assert(!CurNode->Recursive);
1118 if (!PrevNode) {
1119 PrevNode = CurNode;
1120 continue;
1121 }
1122 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1123 if (!Edge) {
1124 SavedContextIds.clear();
1125 break;
1126 }
1127 PrevNode = CurNode;
1128 set_intersect(SavedContextIds, Edge->getContextIds());
1129
1130 // If we now have no context ids for clone, skip this call.
1131 if (SavedContextIds.empty())
1132 break;
1133 }
1134 if (SavedContextIds.empty())
1135 continue;
1136
1137 // Create new context node.
1138 NodeOwner.push_back(
1139 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1140 ContextNode *NewNode = NodeOwner.back().get();
1141 NodeToCallingFunc[NewNode] = Func;
1142 NonAllocationCallToContextNodeMap[Call] = NewNode;
1143 NewNode->ContextIds = SavedContextIds;
1144 NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
1145
1146 // Connect to callees of innermost stack frame in inlined call chain.
1147 // This updates context ids for FirstNode's callees to reflect those
1148 // moved to NewNode.
1149 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1150
1151 // Connect to callers of outermost stack frame in inlined call chain.
1152 // This updates context ids for LastNode's callers to reflect those
1153 // moved to NewNode.
1154 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1155
1156 // Now we need to remove context ids from edges/nodes between First and
1157 // Last Node.
1158 PrevNode = nullptr;
1159 for (auto Id : Ids) {
1160 ContextNode *CurNode = getNodeForStackId(Id);
1161 // We should only have kept stack ids that had nodes.
1162 assert(CurNode);
1163
1164 // Remove the context ids moved to NewNode from CurNode, and the
1165 // edge from the prior node.
1166 set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1167 if (PrevNode) {
1168 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1169 assert(PrevEdge);
1170 set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1171 if (PrevEdge->getContextIds().empty()) {
1172 PrevNode->eraseCallerEdge(PrevEdge);
1173 CurNode->eraseCalleeEdge(PrevEdge);
1174 }
1175 }
1176 PrevNode = CurNode;
1177 }
1178 }
1179 }
1180
1181 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1182 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1183 // Map of stack id to all calls with that as the last (outermost caller)
1184 // callsite id that has a context node (some might not due to pruning
1185 // performed during matching of the allocation profile contexts).
1186 // The CallContextInfo contains the Call and a list of its stack ids with
1187 // ContextNodes, the function containing Call, and the set of context ids
1188 // the analysis will eventually identify for use in any new node created
1189 // for that callsite.
1190 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1191 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1192 for (auto &Call : CallsWithMetadata) {
1193 // Ignore allocations, already handled.
1194 if (AllocationCallToContextNodeMap.count(Call))
1195 continue;
1196 auto StackIdsWithContextNodes =
1197 getStackIdsWithContextNodesForCall(Call.call());
1198 // If there were no nodes created for MIBs on allocs (maybe this was in
1199 // the unambiguous part of the MIB stack that was pruned), ignore.
1200 if (StackIdsWithContextNodes.empty())
1201 continue;
1202 // Otherwise, record this Call along with the list of ids for the last
1203 // (outermost caller) stack id with a node.
1204 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1205 {Call.call(), StackIdsWithContextNodes, Func, {}});
1206 }
1207 }
1208
1209 // First make a pass through all stack ids that correspond to a call,
1210 // as identified in the above loop. Compute the context ids corresponding to
1211 // each of these calls when they correspond to multiple stack ids due to
1212 // inlining. Perform any duplication of context ids required when
1213 // there is more than one call with the same stack ids. Their (possibly newly
1214 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1215 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1216 for (auto &It : StackIdToMatchingCalls) {
1217 auto &Calls = It.getSecond();
1218 // Skip single calls with a single stack id. These don't need a new node.
1219 if (Calls.size() == 1) {
1220 auto &Ids = std::get<1>(Calls[0]);
1221 if (Ids.size() == 1)
1222 continue;
1223 }
1224 // In order to do the best and maximal matching of inlined calls to context
1225 // node sequences we will sort the vectors of stack ids in descending order
1226 // of length, and within each length, lexicographically by stack id.
The 1227 // latter is so that we can specially handle calls that have identical stack 1228 // id sequences (either due to cloning or artificially because of the MIB 1229 // context pruning). 1230 std::stable_sort(Calls.begin(), Calls.end(), 1231 [](const CallContextInfo &A, const CallContextInfo &B) { 1232 auto &IdsA = std::get<1>(A); 1233 auto &IdsB = std::get<1>(B); 1234 return IdsA.size() > IdsB.size() || 1235 (IdsA.size() == IdsB.size() && IdsA < IdsB); 1236 }); 1237 1238 // Find the node for the last stack id, which should be the same 1239 // across all calls recorded for this id, and is the id for this 1240 // entry in the StackIdToMatchingCalls map. 1241 uint64_t LastId = It.getFirst(); 1242 ContextNode *LastNode = getNodeForStackId(LastId); 1243 // We should only have kept stack ids that had nodes. 1244 assert(LastNode); 1245 1246 if (LastNode->Recursive) 1247 continue; 1248 1249 // Initialize the context ids with the last node's. We will subsequently 1250 // refine the context ids by computing the intersection along all edges. 1251 DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds; 1252 assert(!LastNodeContextIds.empty()); 1253 1254 for (unsigned I = 0; I < Calls.size(); I++) { 1255 auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; 1256 assert(SavedContextIds.empty()); 1257 assert(LastId == Ids.back()); 1258 1259 // First compute the context ids for this stack id sequence (the 1260 // intersection of the context ids of the corresponding nodes). 1261 // Start with the remaining saved ids for the last node. 1262 assert(!LastNodeContextIds.empty()); 1263 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; 1264 1265 ContextNode *PrevNode = LastNode; 1266 ContextNode *CurNode = LastNode; 1267 bool Skip = false; 1268 1269 // Iterate backwards through the stack Ids, starting after the last Id 1270 // in the list, which was handled once outside for all Calls. 1271 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { 1272 auto Id = *IdIter; 1273 CurNode = getNodeForStackId(Id); 1274 // We should only have kept stack ids that had nodes. 1275 assert(CurNode); 1276 1277 if (CurNode->Recursive) { 1278 Skip = true; 1279 break; 1280 } 1281 1282 auto *Edge = CurNode->findEdgeFromCaller(PrevNode); 1283 // If there is no edge then the nodes belong to different MIB contexts, 1284 // and we should skip this inlined context sequence. For example, this 1285 // particular inlined context may include stack ids A->B, and we may 1286 // indeed have nodes for both A and B, but it is possible that they were 1287 // never profiled in sequence in a single MIB for any allocation (i.e. 1288 // we might have profiled an allocation that involves the callsite A, 1289 // but through a different one of its callee callsites, and we might 1290 // have profiled an allocation that involves callsite B, but reached 1291 // from a different caller callsite). 1292 if (!Edge) { 1293 Skip = true; 1294 break; 1295 } 1296 PrevNode = CurNode; 1297 1298 // Update the context ids, which is the intersection of the ids along 1299 // all edges in the sequence. 1300 set_intersect(StackSequenceContextIds, Edge->getContextIds()); 1301 1302 // If we now have no context ids for clone, skip this call. 
1303 if (StackSequenceContextIds.empty()) { 1304 Skip = true; 1305 break; 1306 } 1307 } 1308 if (Skip) 1309 continue; 1310 1311 // If some of this call's stack ids did not have corresponding nodes (due 1312 // to pruning), don't include any context ids for contexts that extend 1313 // beyond these nodes. Otherwise we would be matching part of unrelated / 1314 // not fully matching stack contexts. To do this, subtract any context ids 1315 // found in caller nodes of the last node found above. 1316 if (Ids.back() != getLastStackId(Call)) { 1317 for (const auto &PE : CurNode->CallerEdges) { 1318 set_subtract(StackSequenceContextIds, PE->getContextIds()); 1319 if (StackSequenceContextIds.empty()) 1320 break; 1321 } 1322 // If we now have no context ids for clone, skip this call. 1323 if (StackSequenceContextIds.empty()) 1324 continue; 1325 } 1326 1327 // Check if the next set of stack ids is the same (since the Calls vector 1328 // of tuples is sorted by the stack ids we can just look at the next one). 1329 bool DuplicateContextIds = false; 1330 if (I + 1 < Calls.size()) { 1331 auto NextIds = std::get<1>(Calls[I + 1]); 1332 DuplicateContextIds = Ids == NextIds; 1333 } 1334 1335 // If we don't have duplicate context ids, then we can assign all the 1336 // context ids computed for the original node sequence to this call. 1337 // If there are duplicate calls with the same stack ids then we synthesize 1338 // new context ids that are duplicates of the originals. These are 1339 // assigned to SavedContextIds, which is a reference into the map entry 1340 // for this call, allowing us to access these ids later on. 1341 OldToNewContextIds.reserve(OldToNewContextIds.size() + 1342 StackSequenceContextIds.size()); 1343 SavedContextIds = 1344 DuplicateContextIds 1345 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) 1346 : StackSequenceContextIds; 1347 assert(!SavedContextIds.empty()); 1348 1349 if (!DuplicateContextIds) { 1350 // Update saved last node's context ids to remove those that are 1351 // assigned to other calls, so that it is ready for the next call at 1352 // this stack id. 1353 set_subtract(LastNodeContextIds, StackSequenceContextIds); 1354 if (LastNodeContextIds.empty()) 1355 break; 1356 } 1357 } 1358 } 1359 1360 // Propagate the duplicate context ids over the graph. 1361 propagateDuplicateContextIds(OldToNewContextIds); 1362 1363 if (VerifyCCG) 1364 check(); 1365 1366 // Now perform a post-order traversal over the graph, starting with the 1367 // allocation nodes, essentially processing nodes from callers to callees. 1368 // For any that contains an id in the map, update the graph to contain new 1369 // nodes representing any inlining at interior callsites. Note we move the 1370 // associated context ids over to the new nodes. 1371 DenseSet<const ContextNode *> Visited; 1372 for (auto &Entry : AllocationCallToContextNodeMap) 1373 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); 1374 } 1375 1376 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { 1377 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1378 Call->getMetadata(LLVMContext::MD_callsite)); 1379 return CallsiteContext.back(); 1380 } 1381 1382 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { 1383 assert(isa<CallsiteInfo *>(Call.getBase())); 1384 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1385 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); 1386 // Need to convert index into stack id. 
1387 return Index.getStackIdAtIndex(CallsiteContext.back()); 1388 } 1389 1390 static const std::string MemProfCloneSuffix = ".memprof."; 1391 1392 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { 1393 // We use CloneNo == 0 to refer to the original version, which doesn't get 1394 // renamed with a suffix. 1395 if (!CloneNo) 1396 return Base.str(); 1397 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); 1398 } 1399 1400 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, 1401 const Instruction *Call, 1402 unsigned CloneNo) const { 1403 return (Twine(Call->getFunction()->getName()) + " -> " + 1404 cast<CallBase>(Call)->getCalledFunction()->getName()) 1405 .str(); 1406 } 1407 1408 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, 1409 const IndexCall &Call, 1410 unsigned CloneNo) const { 1411 auto VI = FSToVIMap.find(Func); 1412 assert(VI != FSToVIMap.end()); 1413 if (isa<AllocInfo *>(Call.getBase())) 1414 return (VI->second.name() + " -> alloc").str(); 1415 else { 1416 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase()); 1417 return (VI->second.name() + " -> " + 1418 getMemProfFuncName(Callsite->Callee.name(), 1419 Callsite->Clones[CloneNo])) 1420 .str(); 1421 } 1422 } 1423 1424 std::vector<uint64_t> 1425 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( 1426 Instruction *Call) { 1427 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1428 Call->getMetadata(LLVMContext::MD_callsite)); 1429 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( 1430 CallsiteContext); 1431 } 1432 1433 std::vector<uint64_t> 1434 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { 1435 assert(isa<CallsiteInfo *>(Call.getBase())); 1436 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1437 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); 1438 return getStackIdsWithContextNodes<CallsiteInfo, 1439 SmallVector<unsigned>::const_iterator>( 1440 CallsiteContext); 1441 } 1442 1443 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1444 template <class NodeT, class IteratorT> 1445 std::vector<uint64_t> 1446 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( 1447 CallStack<NodeT, IteratorT> &CallsiteContext) { 1448 std::vector<uint64_t> StackIds; 1449 for (auto IdOrIndex : CallsiteContext) { 1450 auto StackId = getStackId(IdOrIndex); 1451 ContextNode *Node = getNodeForStackId(StackId); 1452 if (!Node) 1453 break; 1454 StackIds.push_back(StackId); 1455 } 1456 return StackIds; 1457 } 1458 1459 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( 1460 Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) 1461 : Mod(M), OREGetter(OREGetter) { 1462 for (auto &F : M) { 1463 std::vector<CallInfo> CallsWithMetadata; 1464 for (auto &BB : F) { 1465 for (auto &I : BB) { 1466 if (!isa<CallBase>(I)) 1467 continue; 1468 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { 1469 CallsWithMetadata.push_back(&I); 1470 auto *AllocNode = addAllocNode(&I, &F); 1471 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); 1472 assert(CallsiteMD); 1473 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD); 1474 // Add all of the MIBs and their stack nodes. 
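// For illustration only (hypothetical ids and allocation types), the
// metadata walked below has roughly this shape in the IR:
//   %call = call ptr @_Znam(i64 10), !memprof !1, !callsite !6
//   !1 = !{!2, !4}              ; list of MIB nodes
//   !2 = !{!3, !"notcold"}      ; MIB: stack context node + alloc type
//   !3 = !{i64 123, i64 456}    ; stack ids for that context
//   !4 = !{!5, !"cold"}
//   !5 = !{i64 123, i64 789}
//   !6 = !{i64 123}             ; !callsite stack ids for the call itself
// Each MIB contributes one context rooted at AllocNode.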
1475 for (auto &MDOp : MemProfMD->operands()) { 1476 auto *MIBMD = cast<const MDNode>(MDOp); 1477 MDNode *StackNode = getMIBStackNode(MIBMD); 1478 assert(StackNode); 1479 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); 1480 addStackNodesForMIB<MDNode, MDNode::op_iterator>( 1481 AllocNode, StackContext, CallsiteContext, 1482 getMIBAllocType(MIBMD)); 1483 } 1484 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 1485 // Memprof and callsite metadata on memory allocations no longer 1486 // needed. 1487 I.setMetadata(LLVMContext::MD_memprof, nullptr); 1488 I.setMetadata(LLVMContext::MD_callsite, nullptr); 1489 } 1490 // For callsite metadata, add to list for this function for later use. 1491 else if (I.getMetadata(LLVMContext::MD_callsite)) 1492 CallsWithMetadata.push_back(&I); 1493 } 1494 } 1495 if (!CallsWithMetadata.empty()) 1496 FuncToCallsWithMetadata.push_back({&F, CallsWithMetadata}); 1497 } 1498 1499 if (DumpCCG) { 1500 dbgs() << "CCG before updating call stack chains:\n"; 1501 dbgs() << *this; 1502 } 1503 1504 if (ExportToDot) 1505 exportToDot("prestackupdate"); 1506 1507 updateStackNodes(); 1508 1509 handleCallsitesWithMultipleTargets(); 1510 1511 // Strip off remaining callsite metadata, no longer needed. 1512 for (auto &FuncEntry : FuncToCallsWithMetadata) 1513 for (auto &Call : FuncEntry.second) 1514 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); 1515 } 1516 1517 IndexCallsiteContextGraph::IndexCallsiteContextGraph( 1518 ModuleSummaryIndex &Index, 1519 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 1520 isPrevailing) 1521 : Index(Index) { 1522 for (auto &I : Index) { 1523 auto VI = Index.getValueInfo(I); 1524 for (auto &S : VI.getSummaryList()) { 1525 // We should only add the prevailing nodes. Otherwise we may try to clone 1526 // in a weak copy that won't be linked (and may be different than the 1527 // prevailing version). 1528 // We only keep the memprof summary on the prevailing copy now when 1529 // building the combined index, as a space optimization, however don't 1530 // rely on this optimization. The linker doesn't resolve local linkage 1531 // values so don't check whether those are prevailing. 1532 if (!GlobalValue::isLocalLinkage(S->linkage()) && 1533 !isPrevailing(VI.getGUID(), S.get())) 1534 continue; 1535 auto *FS = dyn_cast<FunctionSummary>(S.get()); 1536 if (!FS) 1537 continue; 1538 std::vector<CallInfo> CallsWithMetadata; 1539 if (!FS->allocs().empty()) { 1540 for (auto &AN : FS->mutableAllocs()) { 1541 // This can happen because of recursion elimination handling that 1542 // currently exists in ModuleSummaryAnalysis. Skip these for now. 1543 // We still added them to the summary because we need to be able to 1544 // correlate properly in applyImport in the backends. 1545 if (AN.MIBs.empty()) 1546 continue; 1547 CallsWithMetadata.push_back({&AN}); 1548 auto *AllocNode = addAllocNode({&AN}, FS); 1549 // Pass an empty CallStack to the CallsiteContext (second) 1550 // parameter, since for ThinLTO we already collapsed out the inlined 1551 // stack ids on the allocation call during ModuleSummaryAnalysis. 1552 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 1553 EmptyContext; 1554 // Now add all of the MIBs and their stack nodes. 
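// Note, for comparison with the IR case above: each summary MIB carries an
// alloc type plus indices into the summary's stack id table rather than raw
// 64-bit stack ids; the getStackId() override below maps an index back to
// the corresponding id.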
1555 for (auto &MIB : AN.MIBs) { 1556 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 1557 StackContext(&MIB); 1558 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( 1559 AllocNode, StackContext, EmptyContext, MIB.AllocType); 1560 } 1561 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 1562 // Initialize version 0 on the summary alloc node to the current alloc 1563 // type, unless it has both types in which case make it default, so 1564 // that in the case where we aren't able to clone the original version 1565 // always ends up with the default allocation behavior. 1566 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); 1567 } 1568 } 1569 // For callsite metadata, add to list for this function for later use. 1570 if (!FS->callsites().empty()) 1571 for (auto &SN : FS->mutableCallsites()) 1572 CallsWithMetadata.push_back({&SN}); 1573 1574 if (!CallsWithMetadata.empty()) 1575 FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata}); 1576 1577 if (!FS->allocs().empty() || !FS->callsites().empty()) 1578 FSToVIMap[FS] = VI; 1579 } 1580 } 1581 1582 if (DumpCCG) { 1583 dbgs() << "CCG before updating call stack chains:\n"; 1584 dbgs() << *this; 1585 } 1586 1587 if (ExportToDot) 1588 exportToDot("prestackupdate"); 1589 1590 updateStackNodes(); 1591 1592 handleCallsitesWithMultipleTargets(); 1593 } 1594 1595 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1596 void CallsiteContextGraph<DerivedCCG, FuncTy, 1597 CallTy>::handleCallsitesWithMultipleTargets() { 1598 // Look for and workaround callsites that call multiple functions. 1599 // This can happen for indirect calls, which needs better handling, and in 1600 // more rare cases (e.g. macro expansion). 1601 // TODO: To fix this for indirect calls we will want to perform speculative 1602 // devirtualization using either the normal PGO info with ICP, or using the 1603 // information in the profiled MemProf contexts. We can do this prior to 1604 // this transformation for regular LTO, and for ThinLTO we can simulate that 1605 // effect in the summary and perform the actual speculative devirtualization 1606 // while cloning in the ThinLTO backend. 1607 for (auto Entry = NonAllocationCallToContextNodeMap.begin(); 1608 Entry != NonAllocationCallToContextNodeMap.end();) { 1609 auto *Node = Entry->second; 1610 assert(Node->Clones.empty()); 1611 // Check all node callees and see if in the same function. 1612 bool Removed = false; 1613 auto Call = Node->Call.call(); 1614 for (auto &Edge : Node->CalleeEdges) { 1615 if (!Edge->Callee->hasCall()) 1616 continue; 1617 assert(NodeToCallingFunc.count(Edge->Callee)); 1618 // Check if the called function matches that of the callee node. 1619 if (calleeMatchesFunc(Call, NodeToCallingFunc[Edge->Callee])) 1620 continue; 1621 // Work around by setting Node to have a null call, so it gets 1622 // skipped during cloning. Otherwise assignFunctions will assert 1623 // because its data structures are not designed to handle this case. 1624 Entry = NonAllocationCallToContextNodeMap.erase(Entry); 1625 Node->setCall(CallInfo()); 1626 Removed = true; 1627 break; 1628 } 1629 if (!Removed) 1630 Entry++; 1631 } 1632 } 1633 1634 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { 1635 // In the Module (IR) case this is already the Id. 
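// E.g. (hypothetical value) an IdOrIndex of 0x1e240 read from IR metadata is
// returned unchanged here, whereas the index variant below must first look
// it up in the summary's stack id list.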
1636 return IdOrIndex; 1637 } 1638 1639 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { 1640 // In the Index case this is an index into the stack id list in the summary 1641 // index, convert it to an Id. 1642 return Index.getStackIdAtIndex(IdOrIndex); 1643 } 1644 1645 bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call, 1646 const Function *Func) { 1647 auto *CB = dyn_cast<CallBase>(Call); 1648 if (!CB->getCalledOperand()) 1649 return false; 1650 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); 1651 auto *CalleeFunc = dyn_cast<Function>(CalleeVal); 1652 if (CalleeFunc == Func) 1653 return true; 1654 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal); 1655 return Alias && Alias->getAliasee() == Func; 1656 } 1657 1658 bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call, 1659 const FunctionSummary *Func) { 1660 ValueInfo Callee = 1661 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee; 1662 // If there is no summary list then this is a call to an externally defined 1663 // symbol. 1664 AliasSummary *Alias = 1665 Callee.getSummaryList().empty() 1666 ? nullptr 1667 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get()); 1668 assert(FSToVIMap.count(Func)); 1669 return Callee == FSToVIMap[Func] || 1670 // If callee is an alias, check the aliasee, since only function 1671 // summary base objects will contain the stack node summaries and thus 1672 // get a context node. 1673 (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]); 1674 } 1675 1676 static std::string getAllocTypeString(uint8_t AllocTypes) { 1677 if (!AllocTypes) 1678 return "None"; 1679 std::string Str; 1680 if (AllocTypes & (uint8_t)AllocationType::NotCold) 1681 Str += "NotCold"; 1682 if (AllocTypes & (uint8_t)AllocationType::Cold) 1683 Str += "Cold"; 1684 return Str; 1685 } 1686 1687 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1688 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() 1689 const { 1690 print(dbgs()); 1691 dbgs() << "\n"; 1692 } 1693 1694 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1695 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( 1696 raw_ostream &OS) const { 1697 OS << "Node " << this << "\n"; 1698 OS << "\t"; 1699 printCall(OS); 1700 if (Recursive) 1701 OS << " (recursive)"; 1702 OS << "\n"; 1703 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; 1704 OS << "\tContextIds:"; 1705 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 1706 std::sort(SortedIds.begin(), SortedIds.end()); 1707 for (auto Id : SortedIds) 1708 OS << " " << Id; 1709 OS << "\n"; 1710 OS << "\tCalleeEdges:\n"; 1711 for (auto &Edge : CalleeEdges) 1712 OS << "\t\t" << *Edge << "\n"; 1713 OS << "\tCallerEdges:\n"; 1714 for (auto &Edge : CallerEdges) 1715 OS << "\t\t" << *Edge << "\n"; 1716 if (!Clones.empty()) { 1717 OS << "\tClones: "; 1718 FieldSeparator FS; 1719 for (auto *Clone : Clones) 1720 OS << FS << Clone; 1721 OS << "\n"; 1722 } else if (CloneOf) { 1723 OS << "\tClone of " << CloneOf << "\n"; 1724 } 1725 } 1726 1727 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1728 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() 1729 const { 1730 print(dbgs()); 1731 dbgs() << "\n"; 1732 } 1733 1734 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1735 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( 1736 raw_ostream &OS) const { 1737 OS << "Edge from Callee " << 
Callee << " to Caller: " << Caller 1738 << " AllocTypes: " << getAllocTypeString(AllocTypes); 1739 OS << " ContextIds:"; 1740 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 1741 std::sort(SortedIds.begin(), SortedIds.end()); 1742 for (auto Id : SortedIds) 1743 OS << " " << Id; 1744 } 1745 1746 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1747 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { 1748 print(dbgs()); 1749 } 1750 1751 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1752 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( 1753 raw_ostream &OS) const { 1754 OS << "Callsite Context Graph:\n"; 1755 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 1756 for (const auto Node : nodes<GraphType>(this)) { 1757 if (Node->isRemoved()) 1758 continue; 1759 Node->print(OS); 1760 OS << "\n"; 1761 } 1762 } 1763 1764 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1765 static void checkEdge( 1766 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { 1767 // Confirm that alloc type is not None and that we have at least one context 1768 // id. 1769 assert(Edge->AllocTypes != (uint8_t)AllocationType::None); 1770 assert(!Edge->ContextIds.empty()); 1771 } 1772 1773 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1774 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, 1775 bool CheckEdges = true) { 1776 if (Node->isRemoved()) 1777 return; 1778 // Node's context ids should be the union of both its callee and caller edge 1779 // context ids. 1780 if (Node->CallerEdges.size()) { 1781 auto EI = Node->CallerEdges.begin(); 1782 auto &FirstEdge = *EI; 1783 EI++; 1784 DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds); 1785 for (; EI != Node->CallerEdges.end(); EI++) { 1786 const auto &Edge = *EI; 1787 if (CheckEdges) 1788 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 1789 set_union(CallerEdgeContextIds, Edge->ContextIds); 1790 } 1791 // Node can have more context ids than callers if some contexts terminate at 1792 // node and some are longer. 
1793 assert(Node->ContextIds == CallerEdgeContextIds || 1794 set_is_subset(CallerEdgeContextIds, Node->ContextIds)); 1795 } 1796 if (Node->CalleeEdges.size()) { 1797 auto EI = Node->CalleeEdges.begin(); 1798 auto &FirstEdge = *EI; 1799 EI++; 1800 DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds); 1801 for (; EI != Node->CalleeEdges.end(); EI++) { 1802 const auto &Edge = *EI; 1803 if (CheckEdges) 1804 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 1805 set_union(CalleeEdgeContextIds, Edge->ContextIds); 1806 } 1807 assert(Node->ContextIds == CalleeEdgeContextIds); 1808 } 1809 } 1810 1811 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1812 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { 1813 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 1814 for (const auto Node : nodes<GraphType>(this)) { 1815 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); 1816 for (auto &Edge : Node->CallerEdges) 1817 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 1818 } 1819 } 1820 1821 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1822 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { 1823 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 1824 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; 1825 1826 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; 1827 static NodeRef getNode(const NodePtrTy &P) { return P.get(); } 1828 1829 using nodes_iterator = 1830 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, 1831 decltype(&getNode)>; 1832 1833 static nodes_iterator nodes_begin(GraphType G) { 1834 return nodes_iterator(G->NodeOwner.begin(), &getNode); 1835 } 1836 1837 static nodes_iterator nodes_end(GraphType G) { 1838 return nodes_iterator(G->NodeOwner.end(), &getNode); 1839 } 1840 1841 static NodeRef getEntryNode(GraphType G) { 1842 return G->NodeOwner.begin()->get(); 1843 } 1844 1845 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; 1846 static const ContextNode<DerivedCCG, FuncTy, CallTy> * 1847 GetCallee(const EdgePtrTy &P) { 1848 return P->Callee; 1849 } 1850 1851 using ChildIteratorType = 1852 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< 1853 DerivedCCG, FuncTy, CallTy>>>::const_iterator, 1854 decltype(&GetCallee)>; 1855 1856 static ChildIteratorType child_begin(NodeRef N) { 1857 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); 1858 } 1859 1860 static ChildIteratorType child_end(NodeRef N) { 1861 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); 1862 } 1863 }; 1864 1865 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1866 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> 1867 : public DefaultDOTGraphTraits { 1868 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} 1869 1870 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 1871 using GTraits = GraphTraits<GraphType>; 1872 using NodeRef = typename GTraits::NodeRef; 1873 using ChildIteratorType = typename GTraits::ChildIteratorType; 1874 1875 static std::string getNodeLabel(NodeRef Node, GraphType G) { 1876 std::string LabelString = 1877 (Twine("OrigId: ") + (Node->IsAllocation ? 
"Alloc" : "") + 1878 Twine(Node->OrigStackOrAllocId)) 1879 .str(); 1880 LabelString += "\n"; 1881 if (Node->hasCall()) { 1882 auto Func = G->NodeToCallingFunc.find(Node); 1883 assert(Func != G->NodeToCallingFunc.end()); 1884 LabelString += 1885 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); 1886 } else { 1887 LabelString += "null call"; 1888 if (Node->Recursive) 1889 LabelString += " (recursive)"; 1890 else 1891 LabelString += " (external)"; 1892 } 1893 return LabelString; 1894 } 1895 1896 static std::string getNodeAttributes(NodeRef Node, GraphType) { 1897 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + 1898 getContextIds(Node->ContextIds) + "\"") 1899 .str(); 1900 AttributeString += 1901 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); 1902 AttributeString += ",style=\"filled\""; 1903 if (Node->CloneOf) { 1904 AttributeString += ",color=\"blue\""; 1905 AttributeString += ",style=\"filled,bold,dashed\""; 1906 } else 1907 AttributeString += ",style=\"filled\""; 1908 return AttributeString; 1909 } 1910 1911 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, 1912 GraphType) { 1913 auto &Edge = *(ChildIter.getCurrent()); 1914 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" + 1915 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"") 1916 .str(); 1917 } 1918 1919 // Since the NodeOwners list includes nodes that are no longer connected to 1920 // the graph, skip them here. 1921 static bool isNodeHidden(NodeRef Node, GraphType) { 1922 return Node->isRemoved(); 1923 } 1924 1925 private: 1926 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { 1927 std::string IdString = "ContextIds:"; 1928 if (ContextIds.size() < 100) { 1929 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 1930 std::sort(SortedIds.begin(), SortedIds.end()); 1931 for (auto Id : SortedIds) 1932 IdString += (" " + Twine(Id)).str(); 1933 } else { 1934 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str(); 1935 } 1936 return IdString; 1937 } 1938 1939 static std::string getColor(uint8_t AllocTypes) { 1940 if (AllocTypes == (uint8_t)AllocationType::NotCold) 1941 // Color "brown1" actually looks like a lighter red. 1942 return "brown1"; 1943 if (AllocTypes == (uint8_t)AllocationType::Cold) 1944 return "cyan"; 1945 if (AllocTypes == 1946 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) 1947 // Lighter purple. 1948 return "mediumorchid1"; 1949 return "gray"; 1950 } 1951 1952 static std::string getNodeId(NodeRef Node) { 1953 std::stringstream SStream; 1954 SStream << std::hex << "N0x" << (unsigned long long)Node; 1955 std::string Result = SStream.str(); 1956 return Result; 1957 } 1958 }; 1959 1960 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1961 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( 1962 std::string Label) const { 1963 WriteGraph(this, "", false, Label, 1964 DotFilePathPrefix + "ccg." 
+ Label + ".dot"); 1965 } 1966 1967 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1968 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 1969 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( 1970 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) { 1971 ContextNode *Node = Edge->Callee; 1972 NodeOwner.push_back( 1973 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call)); 1974 ContextNode *Clone = NodeOwner.back().get(); 1975 Node->addClone(Clone); 1976 assert(NodeToCallingFunc.count(Node)); 1977 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; 1978 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true); 1979 return Clone; 1980 } 1981 1982 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1983 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 1984 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, 1985 ContextNode *NewCallee, EdgeIter *CallerEdgeI, 1986 bool NewClone) { 1987 // NewCallee and Edge's current callee must be clones of the same original 1988 // node (Edge's current callee may be the original node too). 1989 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); 1990 auto &EdgeContextIds = Edge->getContextIds(); 1991 ContextNode *OldCallee = Edge->Callee; 1992 if (CallerEdgeI) 1993 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); 1994 else 1995 OldCallee->eraseCallerEdge(Edge.get()); 1996 Edge->Callee = NewCallee; 1997 NewCallee->CallerEdges.push_back(Edge); 1998 // Don't need to update Edge's context ids since we are simply reconnecting 1999 // it. 2000 set_subtract(OldCallee->ContextIds, EdgeContextIds); 2001 NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end()); 2002 NewCallee->AllocTypes |= Edge->AllocTypes; 2003 OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds); 2004 // OldCallee alloc type should be None iff its context id set is now empty. 2005 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == 2006 OldCallee->ContextIds.empty()); 2007 // Now walk the old callee node's callee edges and move Edge's context ids 2008 // over to the corresponding edge into the clone (which is created here if 2009 // this is a newly created clone). 2010 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { 2011 // The context ids moving to the new callee are the subset of this edge's 2012 // context ids and the context ids on the caller edge being moved. 2013 DenseSet<uint32_t> EdgeContextIdsToMove = 2014 set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds); 2015 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); 2016 OldCalleeEdge->AllocTypes = 2017 computeAllocType(OldCalleeEdge->getContextIds()); 2018 if (!NewClone) { 2019 // Update context ids / alloc type on corresponding edge to NewCallee. 2020 // There is a chance this may not exist if we are reusing an existing 2021 // clone, specifically during function assignment, where we would have 2022 // removed none type edges after creating the clone. If we can't find 2023 // a corresponding edge there, fall through to the cloning below. 
2024 if (auto *NewCalleeEdge =
2025 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2026 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2027 EdgeContextIdsToMove.end());
2028 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2029 continue;
2030 }
2031 }
2032 auto NewEdge = std::make_shared<ContextEdge>(
2033 OldCalleeEdge->Callee, NewCallee,
2034 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2035 NewCallee->CalleeEdges.push_back(NewEdge);
2036 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2037 }
2038 if (VerifyCCG) {
2039 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2040 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2041 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2042 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2043 /*CheckEdges=*/false);
2044 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2045 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2046 /*CheckEdges=*/false);
2047 }
2048 }
2049
2050 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2051 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2052 DenseSet<const ContextNode *> Visited;
2053 for (auto &Entry : AllocationCallToContextNodeMap)
2054 identifyClones(Entry.second, Visited);
2055 }
2056
2057 // Helper function to check whether an AllocType is cold, notcold, or both.
2058 bool checkColdOrNotCold(uint8_t AllocType) {
2059 return (AllocType == (uint8_t)AllocationType::Cold) ||
2060 (AllocType == (uint8_t)AllocationType::NotCold) ||
2061 (AllocType ==
2062 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2063 }
2064
2065 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2066 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2067 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2068 if (VerifyNodes)
2069 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2070 assert(!Node->CloneOf);
2071
2072 // If Node has a null call, then either it wasn't found in the module (regular
2073 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2074 // cloning (e.g. recursion, calls multiple targets, etc).
2075 // Do this here so that we don't try to recursively clone callers below, which
2076 // isn't useful at least for this node.
2077 if (!Node->hasCall())
2078 return;
2079
2080 #ifndef NDEBUG
2081 auto Insert =
2082 #endif
2083 Visited.insert(Node);
2084 // We should not have visited this node yet.
2085 assert(Insert.second);
2086 // The recursive call to identifyClones may delete the current edge from the
2087 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2088 // in an iterator and having the recursive call erase from it. Other edges may
2089 // also get removed during the recursion, which will have null Callee and
2090 // Caller pointers (and are deleted later), so we skip those below.
2091 {
2092 auto CallerEdges = Node->CallerEdges;
2093 for (auto &Edge : CallerEdges) {
2094 // Skip any that have been removed by an earlier recursive call.
2095 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2096 assert(!llvm::count(Node->CallerEdges, Edge));
2097 continue;
2098 }
2099 // Ignore any caller we previously visited via another edge.
2100 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2101 identifyClones(Edge->Caller, Visited);
2102 }
2103 }
2104 }
2105
2106 // Check if we reached an unambiguous call or have only a single caller.
2107 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2108 return;
2109
2110 // We need to clone.
2111
2112 // Try to keep the original version as alloc type NotCold. This will make
2113 // cases with indirect calls or any other situation with an unknown call to
2114 // the original function get the default behavior. We do this by sorting the
2115 // CallerEdges of the Node we will clone by alloc type.
2116 //
2117 // Give NotCold edges the lowest sort priority so those edges are at the end of
2118 // the caller edges vector, and stay on the original version (since the below
2119 // code clones greedily until it finds all remaining edges have the same type
2120 // and leaves the remaining ones on the original Node).
2121 //
2122 // We shouldn't actually have any None type edges, so the sorting priority for
2123 // that is arbitrary, and we assert in that case below.
2124 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2125 /*Cold*/ 1,
2126 /*NotColdCold*/ 2};
2127 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2128 [&](const std::shared_ptr<ContextEdge> &A,
2129 const std::shared_ptr<ContextEdge> &B) {
2130 assert(checkColdOrNotCold(A->AllocTypes) &&
2131 checkColdOrNotCold(B->AllocTypes));
2132
2133 if (A->AllocTypes == B->AllocTypes)
2134 // Use the first context id for each edge as a
2135 // tie-breaker.
2136 return *A->ContextIds.begin() < *B->ContextIds.begin();
2137 return AllocTypeCloningPriority[A->AllocTypes] <
2138 AllocTypeCloningPriority[B->AllocTypes];
2139 });
2140
2141 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2142
2143 // Iterate until we find no more opportunities for disambiguating the alloc
2144 // types via cloning. In most cases this loop will terminate once the Node
2145 // has a single allocation type, in which case no more cloning is needed.
2146 // We need to be able to remove Edge from CallerEdges, so need to adjust
2147 // iterator inside the loop.
2148 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2149 auto CallerEdge = *EI;
2150
2151 // See if cloning the prior caller edge left this node with a single alloc
2152 // type or a single caller. In that case no more cloning of Node is needed.
2153 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2154 break;
2155
2156 // Compute the node callee edge alloc types corresponding to the context ids
2157 // for this caller edge.
2158 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2159 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2160 for (auto &CalleeEdge : Node->CalleeEdges)
2161 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2162 CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2163
2164 // Don't clone if doing so will not disambiguate any alloc types amongst
2165 // caller edges (including the callee edges that would be cloned).
2166 // Otherwise we will simply move all edges to the clone.
2167 //
2168 // First check if by cloning we will disambiguate the caller allocation
2169 // type from node's allocation type. Query allocTypeToUse so that we don't
2170 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2171 // neither of these should be None type.
2172 // 2173 // Then check if by cloning node at least one of the callee edges will be 2174 // disambiguated by splitting out different context ids. 2175 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); 2176 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2177 if (allocTypeToUse(CallerEdge->AllocTypes) == 2178 allocTypeToUse(Node->AllocTypes) && 2179 allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2180 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { 2181 ++EI; 2182 continue; 2183 } 2184 2185 // First see if we can use an existing clone. Check each clone and its 2186 // callee edges for matching alloc types. 2187 ContextNode *Clone = nullptr; 2188 for (auto *CurClone : Node->Clones) { 2189 if (allocTypeToUse(CurClone->AllocTypes) != 2190 allocTypeToUse(CallerEdge->AllocTypes)) 2191 continue; 2192 2193 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2194 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges)) 2195 continue; 2196 Clone = CurClone; 2197 break; 2198 } 2199 2200 // The edge iterator is adjusted when we move the CallerEdge to the clone. 2201 if (Clone) 2202 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI); 2203 else 2204 Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI); 2205 2206 assert(EI == Node->CallerEdges.end() || 2207 Node->AllocTypes != (uint8_t)AllocationType::None); 2208 // Sanity check that no alloc types on clone or its edges are None. 2209 assert(Clone->AllocTypes != (uint8_t)AllocationType::None); 2210 assert(llvm::none_of( 2211 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2212 return E->AllocTypes == (uint8_t)AllocationType::None; 2213 })); 2214 } 2215 2216 // Cloning may have resulted in some cloned callee edges with type None, 2217 // because they aren't carrying any contexts. Remove those edges. 2218 for (auto *Clone : Node->Clones) { 2219 removeNoneTypeCalleeEdges(Clone); 2220 if (VerifyNodes) 2221 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2222 } 2223 // We should still have some context ids on the original Node. 2224 assert(!Node->ContextIds.empty()); 2225 2226 // Remove any callee edges that ended up with alloc type None after creating 2227 // clones and updating callee edges. 2228 removeNoneTypeCalleeEdges(Node); 2229 2230 // Sanity check that no alloc types on node or edges are None. 
2231 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2232 assert(llvm::none_of(Node->CalleeEdges, 2233 [&](const std::shared_ptr<ContextEdge> &E) { 2234 return E->AllocTypes == (uint8_t)AllocationType::None; 2235 })); 2236 assert(llvm::none_of(Node->CallerEdges, 2237 [&](const std::shared_ptr<ContextEdge> &E) { 2238 return E->AllocTypes == (uint8_t)AllocationType::None; 2239 })); 2240 2241 if (VerifyNodes) 2242 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 2243 } 2244 2245 void ModuleCallsiteContextGraph::updateAllocationCall( 2246 CallInfo &Call, AllocationType AllocType) { 2247 std::string AllocTypeString = getAllocTypeAttributeString(AllocType); 2248 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), 2249 "memprof", AllocTypeString); 2250 cast<CallBase>(Call.call())->addFnAttr(A); 2251 OREGetter(Call.call()->getFunction()) 2252 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) 2253 << ore::NV("AllocationCall", Call.call()) << " in clone " 2254 << ore::NV("Caller", Call.call()->getFunction()) 2255 << " marked with memprof allocation attribute " 2256 << ore::NV("Attribute", AllocTypeString)); 2257 } 2258 2259 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, 2260 AllocationType AllocType) { 2261 auto *AI = Call.call().dyn_cast<AllocInfo *>(); 2262 assert(AI); 2263 assert(AI->Versions.size() > Call.cloneNo()); 2264 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; 2265 } 2266 2267 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2268 FuncInfo CalleeFunc) { 2269 if (CalleeFunc.cloneNo() > 0) 2270 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); 2271 OREGetter(CallerCall.call()->getFunction()) 2272 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) 2273 << ore::NV("Call", CallerCall.call()) << " in clone " 2274 << ore::NV("Caller", CallerCall.call()->getFunction()) 2275 << " assigned to call function clone " 2276 << ore::NV("Callee", CalleeFunc.func())); 2277 } 2278 2279 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2280 FuncInfo CalleeFunc) { 2281 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>(); 2282 assert(CI && 2283 "Caller cannot be an allocation which should not have profiled calls"); 2284 assert(CI->Clones.size() > CallerCall.cloneNo()); 2285 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); 2286 } 2287 2288 CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 2289 Instruction *>::FuncInfo 2290 ModuleCallsiteContextGraph::cloneFunctionForCallsite( 2291 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2292 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2293 // Use existing LLVM facilities for cloning and obtaining Call in clone 2294 ValueToValueMapTy VMap; 2295 auto *NewFunc = CloneFunction(Func.func(), VMap); 2296 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); 2297 assert(!Func.func()->getParent()->getFunction(Name)); 2298 NewFunc->setName(Name); 2299 for (auto &Inst : CallsWithMetadataInFunc) { 2300 // This map always has the initial version in it. 
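// For example (hypothetical names): when creating clone 2 of foo(), an
// original call recorded as {X, 0} is mapped to {VMap[X], 2}, i.e. to the
// copy of X that now lives in foo.memprof.2, so later call updates can
// locate it.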
2301 assert(Inst.cloneNo() == 0); 2302 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo}; 2303 } 2304 OREGetter(Func.func()) 2305 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) 2306 << "created clone " << ore::NV("NewFunction", NewFunc)); 2307 return {NewFunc, CloneNo}; 2308 } 2309 2310 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 2311 IndexCall>::FuncInfo 2312 IndexCallsiteContextGraph::cloneFunctionForCallsite( 2313 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2314 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2315 // Check how many clones we have of Call (and therefore function). 2316 // The next clone number is the current size of versions array. 2317 // Confirm this matches the CloneNo provided by the caller, which is based on 2318 // the number of function clones we have. 2319 assert(CloneNo == 2320 (Call.call().is<AllocInfo *>() 2321 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size() 2322 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size())); 2323 // Walk all the instructions in this function. Create a new version for 2324 // each (by adding an entry to the Versions/Clones summary array), and copy 2325 // over the version being called for the function clone being cloned here. 2326 // Additionally, add an entry to the CallMap for the new function clone, 2327 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) 2328 // to the new call clone. 2329 for (auto &Inst : CallsWithMetadataInFunc) { 2330 // This map always has the initial version in it. 2331 assert(Inst.cloneNo() == 0); 2332 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) { 2333 assert(AI->Versions.size() == CloneNo); 2334 // We assign the allocation type later (in updateAllocationCall), just add 2335 // an entry for it here. 2336 AI->Versions.push_back(0); 2337 } else { 2338 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>(); 2339 assert(CI && CI->Clones.size() == CloneNo); 2340 // We assign the clone number later (in updateCall), just add an entry for 2341 // it here. 2342 CI->Clones.push_back(0); 2343 } 2344 CallMap[Inst] = {Inst.call(), CloneNo}; 2345 } 2346 return {Func.func(), CloneNo}; 2347 } 2348 2349 // This method assigns cloned callsites to functions, cloning the functions as 2350 // needed. 
The assignment is greedy and proceeds roughly as follows: 2351 // 2352 // For each function Func: 2353 // For each call with graph Node having clones: 2354 // Initialize ClonesWorklist to Node and its clones 2355 // Initialize NodeCloneCount to 0 2356 // While ClonesWorklist is not empty: 2357 // Clone = pop front ClonesWorklist 2358 // NodeCloneCount++ 2359 // If Func has been cloned less than NodeCloneCount times: 2360 // If NodeCloneCount is 1: 2361 // Assign Clone to original Func 2362 // Continue 2363 // Create a new function clone 2364 // If other callers not assigned to call a function clone yet: 2365 // Assign them to call new function clone 2366 // Continue 2367 // Assign any other caller calling the cloned version to new clone 2368 // 2369 // For each caller of Clone: 2370 // If caller is assigned to call a specific function clone: 2371 // If we cannot assign Clone to that function clone: 2372 // Create new callsite Clone NewClone 2373 // Add NewClone to ClonesWorklist 2374 // Continue 2375 // Assign Clone to existing caller's called function clone 2376 // Else: 2377 // If Clone not already assigned to a function clone: 2378 // Assign to first function clone without assignment 2379 // Assign caller to selected function clone 2380 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2381 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { 2382 bool Changed = false; 2383 2384 // Keep track of the assignment of nodes (callsites) to function clones they 2385 // call. 2386 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; 2387 2388 // Update caller node to call function version CalleeFunc, by recording the 2389 // assignment in CallsiteToCalleeFuncCloneMap. 2390 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, 2391 const FuncInfo &CalleeFunc) { 2392 assert(Caller->hasCall()); 2393 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; 2394 }; 2395 2396 // Walk all functions for which we saw calls with memprof metadata, and handle 2397 // cloning for each of its calls. 2398 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { 2399 FuncInfo OrigFunc(Func); 2400 // Map from each clone of OrigFunc to a map of remappings of each call of 2401 // interest (from original uncloned call to the corresponding cloned call in 2402 // that function clone). 2403 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; 2404 for (auto &Call : CallsWithMetadata) { 2405 ContextNode *Node = getNodeForInst(Call); 2406 // Skip call if we do not have a node for it (all uses of its stack ids 2407 // were either on inlined chains or pruned from the MIBs), or if we did 2408 // not create any clones for it. 2409 if (!Node || Node->Clones.empty()) 2410 continue; 2411 assert(Node->hasCall() && 2412 "Not having a call should have prevented cloning"); 2413 2414 // Track the assignment of function clones to clones of the current 2415 // callsite Node being handled. 2416 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; 2417 2418 // Assign callsite version CallsiteClone to function version FuncClone, 2419 // and also assign (possibly cloned) Call to CallsiteClone. 2420 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, 2421 CallInfo &Call, 2422 ContextNode *CallsiteClone, 2423 bool IsAlloc) { 2424 // Record the clone of callsite node assigned to this function clone. 
2425 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; 2426 2427 assert(FuncClonesToCallMap.count(FuncClone)); 2428 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; 2429 CallInfo CallClone(Call); 2430 if (CallMap.count(Call)) 2431 CallClone = CallMap[Call]; 2432 CallsiteClone->setCall(CallClone); 2433 }; 2434 2435 // Keep track of the clones of callsite Node that need to be assigned to 2436 // function clones. This list may be expanded in the loop body below if we 2437 // find additional cloning is required. 2438 std::deque<ContextNode *> ClonesWorklist; 2439 // Ignore original Node if we moved all of its contexts to clones. 2440 if (!Node->ContextIds.empty()) 2441 ClonesWorklist.push_back(Node); 2442 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), 2443 Node->Clones.end()); 2444 2445 // Now walk through all of the clones of this callsite Node that we need, 2446 // and determine the assignment to a corresponding clone of the current 2447 // function (creating new function clones as needed). 2448 unsigned NodeCloneCount = 0; 2449 while (!ClonesWorklist.empty()) { 2450 ContextNode *Clone = ClonesWorklist.front(); 2451 ClonesWorklist.pop_front(); 2452 NodeCloneCount++; 2453 if (VerifyNodes) 2454 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2455 2456 // Need to create a new function clone if we have more callsite clones 2457 // than existing function clones, which would have been assigned to an 2458 // earlier clone in the list (we assign callsite clones to function 2459 // clones greedily). 2460 if (FuncClonesToCallMap.size() < NodeCloneCount) { 2461 // If this is the first callsite copy, assign to original function. 2462 if (NodeCloneCount == 1) { 2463 // Since FuncClonesToCallMap is empty in this case, no clones have 2464 // been created for this function yet, and no callers should have 2465 // been assigned a function clone for this callee node yet. 2466 assert(llvm::none_of( 2467 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2468 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 2469 })); 2470 // Initialize with empty call map, assign Clone to original function 2471 // and its callers, and skip to the next clone. 2472 FuncClonesToCallMap[OrigFunc] = {}; 2473 AssignCallsiteCloneToFuncClone( 2474 OrigFunc, Call, Clone, 2475 AllocationCallToContextNodeMap.count(Call)); 2476 for (auto &CE : Clone->CallerEdges) { 2477 // Ignore any caller that does not have a recorded callsite Call. 2478 if (!CE->Caller->hasCall()) 2479 continue; 2480 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); 2481 } 2482 continue; 2483 } 2484 2485 // First locate which copy of OrigFunc to clone again. If a caller 2486 // of this callsite clone was already assigned to call a particular 2487 // function clone, we need to redirect all of those callers to the 2488 // new function clone, and update their other callees within this 2489 // function. 2490 FuncInfo PreviousAssignedFuncClone; 2491 auto EI = llvm::find_if( 2492 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2493 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 2494 }); 2495 bool CallerAssignedToCloneOfFunc = false; 2496 if (EI != Clone->CallerEdges.end()) { 2497 const std::shared_ptr<ContextEdge> &Edge = *EI; 2498 PreviousAssignedFuncClone = 2499 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 2500 CallerAssignedToCloneOfFunc = true; 2501 } 2502 2503 // Clone function and save it along with the CallInfo map created 2504 // during cloning in the FuncClonesToCallMap. 
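// Note: the new clone number is simply the number of function versions
// already recorded in FuncClonesToCallMap (the original function counts as
// version 0). E.g. (hypothetical) the second clone of foo gets CloneNo 2
// and, for regular LTO, the name foo.memprof.2 via getMemProfFuncName.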
2505 std::map<CallInfo, CallInfo> NewCallMap; 2506 unsigned CloneNo = FuncClonesToCallMap.size(); 2507 assert(CloneNo > 0 && "Clone 0 is the original function, which " 2508 "should already exist in the map"); 2509 FuncInfo NewFuncClone = cloneFunctionForCallsite( 2510 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); 2511 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); 2512 FunctionClonesAnalysis++; 2513 Changed = true; 2514 2515 // If no caller callsites were already assigned to a clone of this 2516 // function, we can simply assign this clone to the new func clone 2517 // and update all callers to it, then skip to the next clone. 2518 if (!CallerAssignedToCloneOfFunc) { 2519 AssignCallsiteCloneToFuncClone( 2520 NewFuncClone, Call, Clone, 2521 AllocationCallToContextNodeMap.count(Call)); 2522 for (auto &CE : Clone->CallerEdges) { 2523 // Ignore any caller that does not have a recorded callsite Call. 2524 if (!CE->Caller->hasCall()) 2525 continue; 2526 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 2527 } 2528 continue; 2529 } 2530 2531 // We may need to do additional node cloning in this case. 2532 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers 2533 // that were previously assigned to call PreviousAssignedFuncClone, 2534 // to record that they now call NewFuncClone. 2535 for (auto CE : Clone->CallerEdges) { 2536 // Ignore any caller that does not have a recorded callsite Call. 2537 if (!CE->Caller->hasCall()) 2538 continue; 2539 2540 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || 2541 // We subsequently fall through to later handling that 2542 // will perform any additional cloning required for 2543 // callers that were calling other function clones. 2544 CallsiteToCalleeFuncCloneMap[CE->Caller] != 2545 PreviousAssignedFuncClone) 2546 continue; 2547 2548 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 2549 2550 // If we are cloning a function that was already assigned to some 2551 // callers, then essentially we are creating new callsite clones 2552 // of the other callsites in that function that are reached by those 2553 // callers. Clone the other callees of the current callsite's caller 2554 // that were already assigned to PreviousAssignedFuncClone 2555 // accordingly. This is important since we subsequently update the 2556 // calls from the nodes in the graph and their assignments to callee 2557 // functions recorded in CallsiteToCalleeFuncCloneMap. 2558 for (auto CalleeEdge : CE->Caller->CalleeEdges) { 2559 // Skip any that have been removed on an earlier iteration when 2560 // cleaning up newly None type callee edges. 2561 if (!CalleeEdge) 2562 continue; 2563 ContextNode *Callee = CalleeEdge->Callee; 2564 // Skip the current callsite, we are looking for other 2565 // callsites Caller calls, as well as any that does not have a 2566 // recorded callsite Call. 2567 if (Callee == Clone || !Callee->hasCall()) 2568 continue; 2569 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); 2570 removeNoneTypeCalleeEdges(NewClone); 2571 // Moving the edge may have resulted in some none type 2572 // callee edges on the original Callee. 2573 removeNoneTypeCalleeEdges(Callee); 2574 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 2575 // If the Callee node was already assigned to call a specific 2576 // function version, make sure its new clone is assigned to call 2577 // that same function clone. 
2578 if (CallsiteToCalleeFuncCloneMap.count(Callee)) 2579 RecordCalleeFuncOfCallsite( 2580 NewClone, CallsiteToCalleeFuncCloneMap[Callee]); 2581 // Update NewClone with the new Call clone of this callsite's Call 2582 // created for the new function clone created earlier. 2583 // Recall that we have already ensured when building the graph 2584 // that each caller can only call callsites within the same 2585 // function, so we are guaranteed that Callee Call is in the 2586 // current OrigFunc. 2587 // CallMap is set up as indexed by original Call at clone 0. 2588 CallInfo OrigCall(Callee->getOrigNode()->Call); 2589 OrigCall.setCloneNo(0); 2590 std::map<CallInfo, CallInfo> &CallMap = 2591 FuncClonesToCallMap[NewFuncClone]; 2592 assert(CallMap.count(OrigCall)); 2593 CallInfo NewCall(CallMap[OrigCall]); 2594 assert(NewCall); 2595 NewClone->setCall(NewCall); 2596 } 2597 } 2598 // Fall through to handling below to perform the recording of the 2599 // function for this callsite clone. This enables handling of cases 2600 // where the callers were assigned to different clones of a function. 2601 } 2602 2603 // See if we can use existing function clone. Walk through 2604 // all caller edges to see if any have already been assigned to 2605 // a clone of this callsite's function. If we can use it, do so. If not, 2606 // because that function clone is already assigned to a different clone 2607 // of this callsite, then we need to clone again. 2608 // Basically, this checking is needed to handle the case where different 2609 // caller functions/callsites may need versions of this function 2610 // containing different mixes of callsite clones across the different 2611 // callsites within the function. If that happens, we need to create 2612 // additional function clones to handle the various combinations. 2613 // 2614 // Keep track of any new clones of this callsite created by the 2615 // following loop, as well as any existing clone that we decided to 2616 // assign this clone to. 2617 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; 2618 FuncInfo FuncCloneAssignedToCurCallsiteClone; 2619 // We need to be able to remove Edge from CallerEdges, so need to adjust 2620 // iterator in the loop. 2621 for (auto EI = Clone->CallerEdges.begin(); 2622 EI != Clone->CallerEdges.end();) { 2623 auto Edge = *EI; 2624 // Ignore any caller that does not have a recorded callsite Call. 2625 if (!Edge->Caller->hasCall()) { 2626 EI++; 2627 continue; 2628 } 2629 // If this caller already assigned to call a version of OrigFunc, need 2630 // to ensure we can assign this callsite clone to that function clone. 2631 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { 2632 FuncInfo FuncCloneCalledByCaller = 2633 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 2634 // First we need to confirm that this function clone is available 2635 // for use by this callsite node clone. 2636 // 2637 // While FuncCloneToCurNodeCloneMap is built only for this Node and 2638 // its callsite clones, one of those callsite clones X could have 2639 // been assigned to the same function clone called by Edge's caller 2640 // - if Edge's caller calls another callsite within Node's original 2641 // function, and that callsite has another caller reaching clone X. 2642 // We need to clone Node again in this case. 
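// Illustrative scenario (hypothetical names): OrigFunc foo contains
// callsites S1 (this Node) and S2. Edge's caller was previously assigned to
// foo.memprof.1 due to cloning driven by S2, but foo.memprof.1 already
// holds a different clone of S1 reached from some other caller. This Clone
// therefore cannot also be placed in foo.memprof.1, so we must clone the
// callsite (and eventually the function) again.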
2643 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && 2644 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != 2645 Clone) || 2646 // Detect when we have multiple callers of this callsite that 2647 // have already been assigned to specific, and different, clones 2648 // of OrigFunc (due to other unrelated callsites in Func they 2649 // reach via call contexts). Is this Clone of callsite Node 2650 // assigned to a different clone of OrigFunc? If so, clone Node 2651 // again. 2652 (FuncCloneAssignedToCurCallsiteClone && 2653 FuncCloneAssignedToCurCallsiteClone != 2654 FuncCloneCalledByCaller)) { 2655 // We need to use a different newly created callsite clone, in 2656 // order to assign it to another new function clone on a 2657 // subsequent iteration over the Clones array (adjusted below). 2658 // Note we specifically do not reset the 2659 // CallsiteToCalleeFuncCloneMap entry for this caller, so that 2660 // when this new clone is processed later we know which version of 2661 // the function to copy (so that other callsite clones we have 2662 // assigned to that function clone are properly cloned over). See 2663 // comments in the function cloning handling earlier. 2664 2665 // Check if we already have cloned this callsite again while 2666 // walking through caller edges, for a caller calling the same 2667 // function clone. If so, we can move this edge to that new clone 2668 // rather than creating yet another new clone. 2669 if (FuncCloneToNewCallsiteCloneMap.count( 2670 FuncCloneCalledByCaller)) { 2671 ContextNode *NewClone = 2672 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; 2673 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); 2674 // Cleanup any none type edges cloned over. 2675 removeNoneTypeCalleeEdges(NewClone); 2676 } else { 2677 // Create a new callsite clone. 2678 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); 2679 removeNoneTypeCalleeEdges(NewClone); 2680 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = 2681 NewClone; 2682 // Add to list of clones and process later. 2683 ClonesWorklist.push_back(NewClone); 2684 assert(EI == Clone->CallerEdges.end() || 2685 Clone->AllocTypes != (uint8_t)AllocationType::None); 2686 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 2687 } 2688 // Moving the caller edge may have resulted in some none type 2689 // callee edges. 2690 removeNoneTypeCalleeEdges(Clone); 2691 // We will handle the newly created callsite clone in a subsequent 2692 // iteration over this Node's Clones. Continue here since we 2693 // already adjusted iterator EI while moving the edge. 2694 continue; 2695 } 2696 2697 // Otherwise, we can use the function clone already assigned to this 2698 // caller. 2699 if (!FuncCloneAssignedToCurCallsiteClone) { 2700 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; 2701 // Assign Clone to FuncCloneCalledByCaller 2702 AssignCallsiteCloneToFuncClone( 2703 FuncCloneCalledByCaller, Call, Clone, 2704 AllocationCallToContextNodeMap.count(Call)); 2705 } else 2706 // Don't need to do anything - callsite is already calling this 2707 // function clone. 2708 assert(FuncCloneAssignedToCurCallsiteClone == 2709 FuncCloneCalledByCaller); 2710 2711 } else { 2712 // We have not already assigned this caller to a version of 2713 // OrigFunc. Do the assignment now. 2714 2715 // First check if we have already assigned this callsite clone to a 2716 // clone of OrigFunc for another caller during this iteration over 2717 // its caller edges. 
2718 if (!FuncCloneAssignedToCurCallsiteClone) { 2719 // Find first function in FuncClonesToCallMap without an assigned 2720 // clone of this callsite Node. We should always have one 2721 // available at this point due to the earlier cloning when the 2722 // FuncClonesToCallMap size was smaller than the clone number. 2723 for (auto &CF : FuncClonesToCallMap) { 2724 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { 2725 FuncCloneAssignedToCurCallsiteClone = CF.first; 2726 break; 2727 } 2728 } 2729 assert(FuncCloneAssignedToCurCallsiteClone); 2730 // Assign Clone to FuncCloneAssignedToCurCallsiteClone 2731 AssignCallsiteCloneToFuncClone( 2732 FuncCloneAssignedToCurCallsiteClone, Call, Clone, 2733 AllocationCallToContextNodeMap.count(Call)); 2734 } else 2735 assert(FuncCloneToCurNodeCloneMap 2736 [FuncCloneAssignedToCurCallsiteClone] == Clone); 2737 // Update callers to record function version called. 2738 RecordCalleeFuncOfCallsite(Edge->Caller, 2739 FuncCloneAssignedToCurCallsiteClone); 2740 } 2741 2742 EI++; 2743 } 2744 } 2745 if (VerifyCCG) { 2746 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 2747 for (const auto &PE : Node->CalleeEdges) 2748 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 2749 for (const auto &CE : Node->CallerEdges) 2750 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 2751 for (auto *Clone : Node->Clones) { 2752 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2753 for (const auto &PE : Clone->CalleeEdges) 2754 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 2755 for (const auto &CE : Clone->CallerEdges) 2756 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 2757 } 2758 } 2759 } 2760 } 2761 2762 auto UpdateCalls = [&](ContextNode *Node, 2763 DenseSet<const ContextNode *> &Visited, 2764 auto &&UpdateCalls) { 2765 auto Inserted = Visited.insert(Node); 2766 if (!Inserted.second) 2767 return; 2768 2769 for (auto *Clone : Node->Clones) 2770 UpdateCalls(Clone, Visited, UpdateCalls); 2771 2772 for (auto &Edge : Node->CallerEdges) 2773 UpdateCalls(Edge->Caller, Visited, UpdateCalls); 2774 2775 // Skip if either no call to update, or if we ended up with no context ids 2776 // (we moved all edges onto other clones). 2777 if (!Node->hasCall() || Node->ContextIds.empty()) 2778 return; 2779 2780 if (Node->IsAllocation) { 2781 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); 2782 return; 2783 } 2784 2785 if (!CallsiteToCalleeFuncCloneMap.count(Node)) 2786 return; 2787 2788 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; 2789 updateCall(Node->Call, CalleeFunc); 2790 }; 2791 2792 // Performs DFS traversal starting from allocation nodes to update calls to 2793 // reflect cloning decisions recorded earlier. For regular LTO this will 2794 // update the actual calls in the IR to call the appropriate function clone 2795 // (and add attributes to allocation calls), whereas for ThinLTO the decisions 2796 // are recorded in the summary entries. 2797 DenseSet<const ContextNode *> Visited; 2798 for (auto &Entry : AllocationCallToContextNodeMap) 2799 UpdateCalls(Entry.second, Visited, UpdateCalls); 2800 2801 return Changed; 2802 } 2803 2804 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( 2805 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, 2806 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 2807 &FuncToAliasMap) { 2808 // The first "clone" is the original copy, we should only call this if we 2809 // needed to create new clones. 
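// Note: the returned vector holds one ValueToValueMapTy per new clone
// (NumClones - 1 entries); callers look up the copy of an instruction in
// clone J via VMaps[J - 1]. Clone names are produced by
// getMemProfFuncName, which appends a memprof-specific suffix plus the
// clone number to the original name (e.g. a first clone of "foo" ends up
// with a name along the lines of "foo.memprof.1").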
2810 assert(NumClones > 1); 2811 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 2812 VMaps.reserve(NumClones - 1); 2813 FunctionsClonedThinBackend++; 2814 for (unsigned I = 1; I < NumClones; I++) { 2815 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>()); 2816 auto *NewF = CloneFunction(&F, *VMaps.back()); 2817 FunctionClonesThinBackend++; 2818 // Strip memprof and callsite metadata from clone as they are no longer 2819 // needed. 2820 for (auto &BB : *NewF) { 2821 for (auto &Inst : BB) { 2822 Inst.setMetadata(LLVMContext::MD_memprof, nullptr); 2823 Inst.setMetadata(LLVMContext::MD_callsite, nullptr); 2824 } 2825 } 2826 std::string Name = getMemProfFuncName(F.getName(), I); 2827 auto *PrevF = M.getFunction(Name); 2828 if (PrevF) { 2829 // We might have created this when adjusting callsite in another 2830 // function. It should be a declaration. 2831 assert(PrevF->isDeclaration()); 2832 NewF->takeName(PrevF); 2833 PrevF->replaceAllUsesWith(NewF); 2834 PrevF->eraseFromParent(); 2835 } else 2836 NewF->setName(Name); 2837 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) 2838 << "created clone " << ore::NV("NewFunction", NewF)); 2839 2840 // Now handle aliases to this function, and clone those as well. 2841 if (!FuncToAliasMap.count(&F)) 2842 continue; 2843 for (auto *A : FuncToAliasMap[&F]) { 2844 std::string Name = getMemProfFuncName(A->getName(), I); 2845 auto *PrevA = M.getNamedAlias(Name); 2846 auto *NewA = GlobalAlias::create(A->getValueType(), 2847 A->getType()->getPointerAddressSpace(), 2848 A->getLinkage(), Name, NewF); 2849 NewA->copyAttributesFrom(A); 2850 if (PrevA) { 2851 // We might have created this when adjusting callsite in another 2852 // function. It should be a declaration. 2853 assert(PrevA->isDeclaration()); 2854 NewA->takeName(PrevA); 2855 PrevA->replaceAllUsesWith(NewA); 2856 PrevA->eraseFromParent(); 2857 } 2858 } 2859 } 2860 return VMaps; 2861 } 2862 2863 // Locate the summary for F. This is complicated by the fact that it might 2864 // have been internalized or promoted. 2865 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, 2866 const ModuleSummaryIndex *ImportSummary) { 2867 // FIXME: Ideally we would retain the original GUID in some fashion on the 2868 // function (e.g. as metadata), but for now do our best to locate the 2869 // summary without that information. 2870 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID()); 2871 if (!TheFnVI) 2872 // See if theFn was internalized, by checking index directly with 2873 // original name (this avoids the name adjustment done by getGUID() for 2874 // internal symbols). 2875 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName())); 2876 if (TheFnVI) 2877 return TheFnVI; 2878 // Now query with the original name before any promotion was performed. 2879 StringRef OrigName = 2880 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName()); 2881 std::string OrigId = GlobalValue::getGlobalIdentifier( 2882 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName()); 2883 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId)); 2884 if (TheFnVI) 2885 return TheFnVI; 2886 // Could be a promoted local imported from another module. We need to pass 2887 // down more info here to find the original module id. For now, try with 2888 // the OrigName which might have been stored in the OidGuidMap in the 2889 // index. This would not work if there were same-named locals in multiple 2890 // modules, however. 
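// To make the fallbacks above concrete (names hypothetical): a local
// "static void bar()" defined in a.cpp is summarized under a GUID that
// incorporates the source file name; if ThinLTO promoted it, the IR name
// typically carries a ".llvm.<hash>" style suffix, which
// getOriginalNameBeforePromote strips before the local-linkage
// getGlobalIdentifier lookup above. The getGUIDFromOriginalID query
// below is the last resort for promoted locals imported from another
// module.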
2891 auto OrigGUID = 2892 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName)); 2893 if (OrigGUID) 2894 TheFnVI = ImportSummary->getValueInfo(OrigGUID); 2895 return TheFnVI; 2896 } 2897 2898 bool MemProfContextDisambiguation::applyImport(Module &M) { 2899 assert(ImportSummary); 2900 bool Changed = false; 2901 2902 auto IsMemProfClone = [](const Function &F) { 2903 return F.getName().contains(MemProfCloneSuffix); 2904 }; 2905 2906 // We also need to clone any aliases that reference cloned functions, because 2907 // the modified callsites may invoke via the alias. Keep track of the aliases 2908 // for each function. 2909 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 2910 FuncToAliasMap; 2911 for (auto &A : M.aliases()) { 2912 auto *Aliasee = A.getAliaseeObject(); 2913 if (auto *F = dyn_cast<Function>(Aliasee)) 2914 FuncToAliasMap[F].insert(&A); 2915 } 2916 2917 for (auto &F : M) { 2918 if (F.isDeclaration() || IsMemProfClone(F)) 2919 continue; 2920 2921 OptimizationRemarkEmitter ORE(&F); 2922 2923 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 2924 bool ClonesCreated = false; 2925 unsigned NumClonesCreated = 0; 2926 auto CloneFuncIfNeeded = [&](unsigned NumClones) { 2927 // We should at least have version 0 which is the original copy. 2928 assert(NumClones > 0); 2929 // If only one copy needed use original. 2930 if (NumClones == 1) 2931 return; 2932 // If we already performed cloning of this function, confirm that the 2933 // requested number of clones matches (the thin link should ensure the 2934 // number of clones for each constituent callsite is consistent within 2935 // each function), before returning. 2936 if (ClonesCreated) { 2937 assert(NumClonesCreated == NumClones); 2938 return; 2939 } 2940 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); 2941 // The first "clone" is the original copy, which doesn't have a VMap. 2942 assert(VMaps.size() == NumClones - 1); 2943 Changed = true; 2944 ClonesCreated = true; 2945 NumClonesCreated = NumClones; 2946 }; 2947 2948 // Locate the summary for F. 2949 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); 2950 // If not found, this could be an imported local (see comment in 2951 // findValueInfoForFunc). Skip for now as it will be cloned in its original 2952 // module (where it would have been promoted to global scope so should 2953 // satisfy any reference in this module). 2954 if (!TheFnVI) 2955 continue; 2956 2957 auto *GVSummary = 2958 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier()); 2959 if (!GVSummary) 2960 // Must have been imported, use the first summary (might be multiple if 2961 // this was a linkonce_odr). 2962 GVSummary = TheFnVI.getSummaryList().front().get(); 2963 2964 // If this was an imported alias skip it as we won't have the function 2965 // summary, and it should be cloned in the original module. 2966 if (isa<AliasSummary>(GVSummary)) 2967 continue; 2968 2969 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject()); 2970 2971 if (FS->allocs().empty() && FS->callsites().empty()) 2972 continue; 2973 2974 auto SI = FS->callsites().begin(); 2975 auto AI = FS->allocs().begin(); 2976 2977 // Assume for now that the instructions are in the exact same order 2978 // as when the summary was created, but confirm this is correct by 2979 // matching the stack ids. 2980 for (auto &BB : F) { 2981 for (auto &I : BB) { 2982 auto *CB = dyn_cast<CallBase>(&I); 2983 // Same handling as when creating module summary. 
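// The walk below consumes the summary records in lockstep with the IR:
// AI advances once per call carrying !memprof metadata and SI once per
// call carrying only !callsite metadata, mirroring the order in which
// ModuleSummaryAnalysis recorded them. The stack id asserts double-check
// that the two sequences have stayed in sync.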
2984 if (!mayHaveMemprofSummary(CB)) 2985 continue; 2986 2987 auto *CalledValue = CB->getCalledOperand(); 2988 auto *CalledFunction = CB->getCalledFunction(); 2989 if (CalledValue && !CalledFunction) { 2990 CalledValue = CalledValue->stripPointerCasts(); 2991 // Stripping pointer casts can reveal a called function. 2992 CalledFunction = dyn_cast<Function>(CalledValue); 2993 } 2994 // Check if this is an alias to a function. If so, get the 2995 // called aliasee for the checks below. 2996 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { 2997 assert(!CalledFunction && 2998 "Expected null called function in callsite for alias"); 2999 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); 3000 } 3001 3002 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 3003 I.getMetadata(LLVMContext::MD_callsite)); 3004 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); 3005 3006 // Include allocs that were already assigned a memprof function 3007 // attribute in the statistics. 3008 if (CB->getAttributes().hasFnAttr("memprof")) { 3009 assert(!MemProfMD); 3010 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" 3011 ? AllocTypeColdThinBackend++ 3012 : AllocTypeNotColdThinBackend++; 3013 OrigAllocsThinBackend++; 3014 AllocVersionsThinBackend++; 3015 if (!MaxAllocVersionsThinBackend) 3016 MaxAllocVersionsThinBackend = 1; 3017 // Remove any remaining callsite metadata and we can skip the rest of 3018 // the handling for this instruction, since no cloning needed. 3019 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3020 continue; 3021 } 3022 3023 if (MemProfMD) { 3024 // Consult the next alloc node. 3025 assert(AI != FS->allocs().end()); 3026 auto &AllocNode = *(AI++); 3027 3028 // Sanity check that the MIB stack ids match between the summary and 3029 // instruction metadata. 3030 auto MIBIter = AllocNode.MIBs.begin(); 3031 for (auto &MDOp : MemProfMD->operands()) { 3032 assert(MIBIter != AllocNode.MIBs.end()); 3033 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter = 3034 MIBIter->StackIdIndices.begin(); 3035 auto *MIBMD = cast<const MDNode>(MDOp); 3036 MDNode *StackMDNode = getMIBStackNode(MIBMD); 3037 assert(StackMDNode); 3038 SmallVector<unsigned> StackIdsFromMetadata; 3039 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); 3040 for (auto ContextIter = 3041 StackContext.beginAfterSharedPrefix(CallsiteContext); 3042 ContextIter != StackContext.end(); ++ContextIter) { 3043 // If this is a direct recursion, simply skip the duplicate 3044 // entries, to be consistent with how the summary ids were 3045 // generated during ModuleSummaryAnalysis. 3046 if (!StackIdsFromMetadata.empty() && 3047 StackIdsFromMetadata.back() == *ContextIter) 3048 continue; 3049 assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); 3050 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3051 *ContextIter); 3052 StackIdIndexIter++; 3053 } 3054 MIBIter++; 3055 } 3056 3057 // Perform cloning if not yet done. 3058 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); 3059 3060 OrigAllocsThinBackend++; 3061 AllocVersionsThinBackend += AllocNode.Versions.size(); 3062 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) 3063 MaxAllocVersionsThinBackend = AllocNode.Versions.size(); 3064 3065 // If there is only one version that means we didn't end up 3066 // considering this function for cloning, and in that case the alloc 3067 // will still be none type or should have gotten the default NotCold. 
3068 // Skip that after calling clone helper since that does some sanity 3069 // checks that confirm we haven't decided yet that we need cloning. 3070 if (AllocNode.Versions.size() == 1) { 3071 assert((AllocationType)AllocNode.Versions[0] == 3072 AllocationType::NotCold || 3073 (AllocationType)AllocNode.Versions[0] == 3074 AllocationType::None); 3075 UnclonableAllocsThinBackend++; 3076 continue; 3077 } 3078 3079 // All versions should have a singular allocation type. 3080 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { 3081 return Type == ((uint8_t)AllocationType::NotCold | 3082 (uint8_t)AllocationType::Cold); 3083 })); 3084 3085 // Update the allocation types per the summary info. 3086 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { 3087 // Ignore any that didn't get an assigned allocation type. 3088 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) 3089 continue; 3090 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; 3091 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ 3092 : AllocTypeNotColdThinBackend++; 3093 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); 3094 auto A = llvm::Attribute::get(F.getContext(), "memprof", 3095 AllocTypeString); 3096 CallBase *CBClone; 3097 // Copy 0 is the original function. 3098 if (!J) 3099 CBClone = CB; 3100 else 3101 // Since VMaps are only created for new clones, we index with 3102 // clone J-1 (J==0 is the original clone and does not have a VMaps 3103 // entry). 3104 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 3105 CBClone->addFnAttr(A); 3106 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) 3107 << ore::NV("AllocationCall", CBClone) << " in clone " 3108 << ore::NV("Caller", CBClone->getFunction()) 3109 << " marked with memprof allocation attribute " 3110 << ore::NV("Attribute", AllocTypeString)); 3111 } 3112 } else if (!CallsiteContext.empty()) { 3113 // Consult the next callsite node. 3114 assert(SI != FS->callsites().end()); 3115 auto &StackNode = *(SI++); 3116 3117 #ifndef NDEBUG 3118 // Sanity check that the stack ids match between the summary and 3119 // instruction metadata. 3120 auto StackIdIndexIter = StackNode.StackIdIndices.begin(); 3121 for (auto StackId : CallsiteContext) { 3122 assert(StackIdIndexIter != StackNode.StackIdIndices.end()); 3123 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3124 StackId); 3125 StackIdIndexIter++; 3126 } 3127 #endif 3128 3129 // Perform cloning if not yet done. 3130 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); 3131 3132 // Should have skipped indirect calls via mayHaveMemprofSummary. 3133 assert(CalledFunction); 3134 assert(!IsMemProfClone(*CalledFunction)); 3135 3136 // Update the calls per the summary info. 3137 // Save orig name since it gets updated in the first iteration 3138 // below. 3139 auto CalleeOrigName = CalledFunction->getName(); 3140 for (unsigned J = 0; J < StackNode.Clones.size(); J++) { 3141 // Do nothing if this version calls the original version of its 3142 // callee. 3143 if (!StackNode.Clones[J]) 3144 continue; 3145 auto NewF = M.getOrInsertFunction( 3146 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]), 3147 CalledFunction->getFunctionType()); 3148 CallBase *CBClone; 3149 // Copy 0 is the original function. 
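// (As with the allocation handling above, clone J's copy of the call is
// located via VMaps[J - 1]; J == 0 refers to the original function,
// which has no VMap entry, so CB is used directly.)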
3150 if (!J) 3151 CBClone = CB; 3152 else 3153 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 3154 CBClone->setCalledFunction(NewF); 3155 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) 3156 << ore::NV("Call", CBClone) << " in clone " 3157 << ore::NV("Caller", CBClone->getFunction()) 3158 << " assigned to call function clone " 3159 << ore::NV("Callee", NewF.getCallee())); 3160 } 3161 } 3162 // Memprof and callsite metadata on memory allocations no longer needed. 3163 I.setMetadata(LLVMContext::MD_memprof, nullptr); 3164 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3165 } 3166 } 3167 } 3168 3169 return Changed; 3170 } 3171 3172 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3173 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() { 3174 if (DumpCCG) { 3175 dbgs() << "CCG before cloning:\n"; 3176 dbgs() << *this; 3177 } 3178 if (ExportToDot) 3179 exportToDot("postbuild"); 3180 3181 if (VerifyCCG) { 3182 check(); 3183 } 3184 3185 identifyClones(); 3186 3187 if (VerifyCCG) { 3188 check(); 3189 } 3190 3191 if (DumpCCG) { 3192 dbgs() << "CCG after cloning:\n"; 3193 dbgs() << *this; 3194 } 3195 if (ExportToDot) 3196 exportToDot("cloned"); 3197 3198 bool Changed = assignFunctions(); 3199 3200 if (DumpCCG) { 3201 dbgs() << "CCG after assigning function clones:\n"; 3202 dbgs() << *this; 3203 } 3204 if (ExportToDot) 3205 exportToDot("clonefuncassign"); 3206 3207 return Changed; 3208 } 3209 3210 bool MemProfContextDisambiguation::processModule( 3211 Module &M, 3212 function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { 3213 3214 // If we have an import summary, then the cloning decisions were made during 3215 // the thin link on the index. Apply them and return. 3216 if (ImportSummary) 3217 return applyImport(M); 3218 3219 // TODO: If/when other types of memprof cloning are enabled beyond just for 3220 // hot and cold, we will need to change this to individually control the 3221 // AllocationType passed to addStackNodesForMIB during CCG construction. 3222 // Note that we specifically check this after applying imports above, so that 3223 // the option isn't needed to be passed to distributed ThinLTO backend 3224 // clang processes, which won't necessarily have visibility into the linker 3225 // dependences. Instead the information is communicated from the LTO link to 3226 // the backends via the combined summary index. 3227 if (!SupportsHotColdNew) 3228 return false; 3229 3230 ModuleCallsiteContextGraph CCG(M, OREGetter); 3231 return CCG.process(); 3232 } 3233 3234 MemProfContextDisambiguation::MemProfContextDisambiguation( 3235 const ModuleSummaryIndex *Summary) 3236 : ImportSummary(Summary) { 3237 if (ImportSummary) { 3238 // The MemProfImportSummary should only be used for testing ThinLTO 3239 // distributed backend handling via opt, in which case we don't have a 3240 // summary from the pass pipeline. 
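// When it is used, the invocation looks roughly like (file names
// hypothetical):
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=summary.thinlto.bc module.bc
// That testing path is handled below by reading and parsing the summary
// bitcode file.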
3241 assert(MemProfImportSummary.empty()); 3242 return; 3243 } 3244 if (MemProfImportSummary.empty()) 3245 return; 3246 3247 auto ReadSummaryFile = 3248 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary)); 3249 if (!ReadSummaryFile) { 3250 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(), 3251 "Error loading file '" + MemProfImportSummary + 3252 "': "); 3253 return; 3254 } 3255 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile); 3256 if (!ImportSummaryForTestingOrErr) { 3257 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(), 3258 "Error parsing file '" + MemProfImportSummary + 3259 "': "); 3260 return; 3261 } 3262 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr); 3263 ImportSummary = ImportSummaryForTesting.get(); 3264 } 3265 3266 PreservedAnalyses MemProfContextDisambiguation::run(Module &M, 3267 ModuleAnalysisManager &AM) { 3268 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 3269 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { 3270 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); 3271 }; 3272 if (!processModule(M, OREGetter)) 3273 return PreservedAnalyses::all(); 3274 return PreservedAnalyses::none(); 3275 } 3276 3277 void MemProfContextDisambiguation::run( 3278 ModuleSummaryIndex &Index, 3279 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 3280 isPrevailing) { 3281 // TODO: If/when other types of memprof cloning are enabled beyond just for 3282 // hot and cold, we will need to change this to individually control the 3283 // AllocationType passed to addStackNodesForMIB during CCG construction. 3284 // The index was set from the option, so these should be in sync. 3285 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew); 3286 if (!SupportsHotColdNew) 3287 return; 3288 3289 IndexCallsiteContextGraph CCG(Index, isPrevailing); 3290 CCG.process(); 3291 } 3292