1 //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements support for context disambiguation of allocation
10 // calls for profile guided heap optimization. Specifically, it uses Memprof
11 // profiles which indicate context specific allocation behavior (currently
12 // distinguishing cold vs hot memory allocations). Cloning is performed to
13 // expose the cold allocation call contexts, and the allocation calls are
14 // subsequently annotated with an attribute for later transformation.
15 //
16 // The transformations can be performed either directly on IR (regular LTO), or
17 // on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18 // Both types of LTO operate on the same base graph representation, which
19 // uses CRTP to support either IR or Index formats.
20 //
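// As a rough sketch (illustrative values, not taken from a real profile), the
// IR consumed here carries memprof and callsite metadata on allocation calls,
// approximately of the form:
//
//   %call = call ptr @_Znam(i64 10), !memprof !1, !callsite !6
//   !1 = !{!2, !4}                 ; list of MIBs (one per profiled context)
//   !2 = !{!3, !"cold"}            ; MIB: call stack ids + allocation type
//   !3 = !{i64 123, i64 456}
//   !4 = !{!5, !"notcold"}
//   !5 = !{i64 123, i64 789}
//   !6 = !{i64 123}                ; callsite metadata (from inlining)
//
// After cloning, each (possibly cloned) allocation call is annotated with a
// string attribute recording its single resolved allocation type (e.g.
// "memprof"="cold"), which a later transformation uses to select the
// appropriate allocator interface.
//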
21 //===----------------------------------------------------------------------===//
22 
23 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/ADT/DenseSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/SetOperations.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/MemoryProfileInfo.h"
33 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
34 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
35 #include "llvm/Bitcode/BitcodeReader.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/Module.h"
39 #include "llvm/IR/ModuleSummaryIndex.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/FileSystem.h"
43 #include "llvm/Support/GraphWriter.h"
44 #include "llvm/Support/raw_ostream.h"
45 #include "llvm/Transforms/IPO.h"
46 #include "llvm/Transforms/Utils/Cloning.h"
47 #include <sstream>
48 #include <vector>
49 using namespace llvm;
50 using namespace llvm::memprof;
51 
52 #define DEBUG_TYPE "memprof-context-disambiguation"
53 
54 STATISTIC(FunctionClonesAnalysis,
55           "Number of function clones created during whole program analysis");
56 STATISTIC(FunctionClonesThinBackend,
57           "Number of function clones created during ThinLTO backend");
58 STATISTIC(FunctionsClonedThinBackend,
59           "Number of functions that had clones created during ThinLTO backend");
60 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
61                             "cloned) during whole program analysis");
62 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
63                          "during whole program analysis");
64 STATISTIC(AllocTypeNotColdThinBackend,
65           "Number of not cold static allocations (possibly cloned) during "
66           "ThinLTO backend");
67 STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
68                                     "(possibly cloned) during ThinLTO backend");
69 STATISTIC(OrigAllocsThinBackend,
70           "Number of original (not cloned) allocations with memprof profiles "
71           "during ThinLTO backend");
72 STATISTIC(
73     AllocVersionsThinBackend,
74     "Number of allocation versions (including clones) during ThinLTO backend");
75 STATISTIC(MaxAllocVersionsThinBackend,
76           "Maximum number of allocation versions created for an original "
77           "allocation during ThinLTO backend");
78 STATISTIC(UnclonableAllocsThinBackend,
79           "Number of unclonable ambiguous allocations during ThinLTO backend");
80 
81 static cl::opt<std::string> DotFilePathPrefix(
82     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
83     cl::value_desc("filename"),
84     cl::desc("Specify the path prefix of the MemProf dot files."));
85 
86 static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
87                                  cl::Hidden,
88                                  cl::desc("Export graph to dot files."));
89 
90 static cl::opt<bool>
91     DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
92             cl::desc("Dump CallingContextGraph to stdout after each stage."));
93 
94 static cl::opt<bool>
95     VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
96               cl::desc("Perform verification checks on CallingContextGraph."));
97 
98 static cl::opt<bool>
99     VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
100                 cl::desc("Perform frequent verification checks on nodes."));
101 
102 static cl::opt<std::string> MemProfImportSummary(
103     "memprof-import-summary",
104     cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
105     cl::Hidden);
106 
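// A typical way to exercise the above flags when debugging is via a
// standalone opt invocation (sketch; assumes the usual pass name used to
// register this pass):
//
//   opt -passes=memprof-context-disambiguation -memprof-dump-ccg \
//       -memprof-export-to-dot -memprof-dot-file-path-prefix=/tmp/ccg. \
//       input.bc -o out.bc
//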
107 namespace llvm {
108 // Indicate we are linking with an allocator that supports hot/cold operator
109 // new interfaces.
110 cl::opt<bool> SupportsHotColdNew(
111     "supports-hot-cold-new", cl::init(false), cl::Hidden,
112     cl::desc("Linking with hot/cold operator new interfaces"));
113 } // namespace llvm
114 
115 namespace {
116 /// CRTP base for graphs built from either IR or ThinLTO summary index.
117 ///
118 /// The graph represents the call contexts in all memprof metadata on allocation
119 /// calls, with nodes for the allocations themselves, as well as for the calls
120 /// in each context. The graph is initially built from the allocation memprof
121 /// metadata (or summary) MIBs. It is then updated to match calls with callsite
122 /// metadata onto the nodes, updating it to reflect any inlining performed on
123 /// those calls.
124 ///
125 /// Each MIB (representing an allocation's call context with allocation
126 /// behavior) is assigned a unique context id during the graph build. The edges
127 /// and nodes in the graph are decorated with the context ids they carry. This
128 /// is used to correctly update the graph when cloning is performed so that we
129 /// can uniquify the context for a single (possibly cloned) allocation.
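///
/// As a small worked example (context ids and frames invented for
/// exposition): if one allocation is reached via main -> A -> alloc with a
/// cold MIB and via main -> B -> alloc with a notcold MIB, the two MIBs are
/// assigned context ids 1 and 2, stack nodes are created for A, B and main,
/// and the edges between the allocation node and A carry id set {1} while
/// those through B carry {2}. Cloning the allocation for the cold context
/// then only needs to follow the edges and nodes whose id sets contain 1.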
130 template <typename DerivedCCG, typename FuncTy, typename CallTy>
131 class CallsiteContextGraph {
132 public:
133   CallsiteContextGraph() = default;
134   CallsiteContextGraph(const CallsiteContextGraph &) = default;
135   CallsiteContextGraph(CallsiteContextGraph &&) = default;
136 
137   /// Main entry point to perform analysis and transformations on graph.
138   bool process();
139 
140   /// Perform cloning on the graph necessary to uniquely identify the allocation
141   /// behavior of an allocation based on its context.
142   void identifyClones();
143 
144   /// Assign callsite clones to functions, cloning functions as needed to
145   /// accommodate the combinations of their callsite clones reached by callers.
146   /// For regular LTO this clones functions and callsites in the IR, but for
147   /// ThinLTO the cloning decisions are noted in the summaries and later applied
148   /// in applyImport.
149   bool assignFunctions();
150 
151   void dump() const;
152   void print(raw_ostream &OS) const;
153 
154   friend raw_ostream &operator<<(raw_ostream &OS,
155                                  const CallsiteContextGraph &CCG) {
156     CCG.print(OS);
157     return OS;
158   }
159 
160   friend struct GraphTraits<
161       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
162   friend struct DOTGraphTraits<
163       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
164 
165   void exportToDot(std::string Label) const;
166 
167   /// Represents a function clone via FuncTy pointer and clone number pair.
168   struct FuncInfo final
169       : public std::pair<FuncTy *, unsigned /*Clone number*/> {
170     using Base = std::pair<FuncTy *, unsigned>;
171     FuncInfo(const Base &B) : Base(B) {}
172     FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
173     explicit operator bool() const { return this->first != nullptr; }
174     FuncTy *func() const { return this->first; }
175     unsigned cloneNo() const { return this->second; }
176   };
177 
178   /// Represents a callsite clone via CallTy and clone number pair.
179   struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
180     using Base = std::pair<CallTy, unsigned>;
181     CallInfo(const Base &B) : Base(B) {}
182     CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
183         : Base(Call, CloneNo) {}
184     explicit operator bool() const { return (bool)this->first; }
185     CallTy call() const { return this->first; }
186     unsigned cloneNo() const { return this->second; }
187     void setCloneNo(unsigned N) { this->second = N; }
188     void print(raw_ostream &OS) const {
189       if (!operator bool()) {
190         assert(!cloneNo());
191         OS << "null Call";
192         return;
193       }
194       call()->print(OS);
195       OS << "\t(clone " << cloneNo() << ")";
196     }
197     void dump() const {
198       print(dbgs());
199       dbgs() << "\n";
200     }
201     friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
202       Call.print(OS);
203       return OS;
204     }
205   };
206 
207   struct ContextEdge;
208 
209   /// Node in the Callsite Context Graph
210   struct ContextNode {
211     // Keep this for now since in the IR case, where we have an Instruction*, it
212     // is not as immediately discoverable. Used for printing richer information
213     // when dumping the graph.
214     bool IsAllocation;
215 
216     // Keeps track of when the Call was reset to null because there was
217     // recursion.
218     bool Recursive = false;
219 
220     // The corresponding allocation or interior call.
221     CallInfo Call;
222 
223     // For alloc nodes this is a unique id assigned when constructed, and for
224     // callsite stack nodes it is the original stack id when the node is
225     // constructed from the memprof MIB metadata on the alloc nodes. Note that
226     // this is only used when matching callsite metadata onto the stack nodes
227     // created when processing the allocation memprof MIBs, and for labeling
228     // nodes in the dot graph. Therefore we don't bother to assign a value for
229     // clones.
230     uint64_t OrigStackOrAllocId = 0;
231 
232     // This will be formed by ORing together the AllocationType enum values
233     // for contexts including this node.
234     uint8_t AllocTypes = 0;
235 
236     // Edges to all callees in the profiled call stacks.
237     // TODO: Should this be a map (from Callee node) for more efficient lookup?
238     std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
239 
240     // Edges to all callers in the profiled call stacks.
241     // TODO: Should this be a map (from Caller node) for more efficient lookup?
242     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
243 
244     // The set of IDs for contexts including this node.
245     DenseSet<uint32_t> ContextIds;
246 
247     // List of clones of this ContextNode, initially empty.
248     std::vector<ContextNode *> Clones;
249 
250     // If a clone, points to the original uncloned node.
251     ContextNode *CloneOf = nullptr;
252 
253     ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
254 
255     ContextNode(bool IsAllocation, CallInfo C)
256         : IsAllocation(IsAllocation), Call(C) {}
257 
258     void addClone(ContextNode *Clone) {
259       if (CloneOf) {
260         CloneOf->Clones.push_back(Clone);
261         Clone->CloneOf = CloneOf;
262       } else {
263         Clones.push_back(Clone);
264         assert(!Clone->CloneOf);
265         Clone->CloneOf = this;
266       }
267     }
268 
269     ContextNode *getOrigNode() {
270       if (!CloneOf)
271         return this;
272       return CloneOf;
273     }
274 
275     void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
276                                unsigned int ContextId);
277 
278     ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
279     ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
280     void eraseCalleeEdge(const ContextEdge *Edge);
281     void eraseCallerEdge(const ContextEdge *Edge);
282 
283     void setCall(CallInfo C) { Call = C; }
284 
285     bool hasCall() const { return (bool)Call.call(); }
286 
287     void printCall(raw_ostream &OS) const { Call.print(OS); }
288 
289     // True if this node was effectively removed from the graph, in which case
290     // its context id set, caller edges, and callee edges should all be empty.
291     bool isRemoved() const {
292       assert(ContextIds.empty() ==
293              (CalleeEdges.empty() && CallerEdges.empty()));
294       return ContextIds.empty();
295     }
296 
297     void dump() const;
298     void print(raw_ostream &OS) const;
299 
300     friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
301       Node.print(OS);
302       return OS;
303     }
304   };
305 
306   /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
307   /// callee.
308   struct ContextEdge {
309     ContextNode *Callee;
310     ContextNode *Caller;
311 
312     // This will be formed by ORing together the AllocationType enum values
313     // for contexts including this edge.
314     uint8_t AllocTypes = 0;
315 
316     // The set of IDs for contexts including this edge.
317     DenseSet<uint32_t> ContextIds;
318 
319     ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
320                 DenseSet<uint32_t> ContextIds)
321         : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
322           ContextIds(ContextIds) {}
323 
324     DenseSet<uint32_t> &getContextIds() { return ContextIds; }
325 
326     void dump() const;
327     void print(raw_ostream &OS) const;
328 
329     friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
330       Edge.print(OS);
331       return OS;
332     }
333   };
334 
335   /// Helper to remove callee edges that have allocation type None (due to not
336   /// carrying any context ids) after transformations.
337   void removeNoneTypeCalleeEdges(ContextNode *Node);
338 
339 protected:
340   /// Get a list of nodes corresponding to the stack ids in the given callsite
341   /// context.
342   template <class NodeT, class IteratorT>
343   std::vector<uint64_t>
344   getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
345 
346   /// Adds nodes for the given allocation and any stack ids on its memprof MIB
347   /// metadata (or summary).
348   ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
349 
350   /// Adds nodes for the given MIB stack ids.
351   template <class NodeT, class IteratorT>
352   void addStackNodesForMIB(ContextNode *AllocNode,
353                            CallStack<NodeT, IteratorT> &StackContext,
354                            CallStack<NodeT, IteratorT> &CallsiteContext,
355                            AllocationType AllocType);
356 
357   /// Matches all callsite metadata (or summary) to the nodes created for
358   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
359   /// inlining performed on those callsite instructions.
360   void updateStackNodes();
361 
362   /// Update graph to conservatively handle any callsite stack nodes that target
363   /// multiple different callee target functions.
364   void handleCallsitesWithMultipleTargets();
365 
366   /// Save lists of calls with MemProf metadata in each function, for faster
367   /// iteration.
368   std::vector<std::pair<FuncTy *, std::vector<CallInfo>>>
369       FuncToCallsWithMetadata;
370 
371   /// Map from callsite node to the enclosing caller function.
372   std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
373 
374 private:
375   using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
376 
377   using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
378                                      const FuncTy *, DenseSet<uint32_t>>;
379 
380   /// Assigns the given Node to calls at or inlined into the location with
381   /// the Node's stack id, after post order traversing and processing its
382   /// caller nodes. Uses the call information recorded in the given
383   /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
384   /// as needed. Called by updateStackNodes which sets up the given
385   /// StackIdToMatchingCalls map.
386   void assignStackNodesPostOrder(
387       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
388       DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
389 
390   /// Duplicates the given set of context ids, updating the provided
391   /// map from each original id with the newly generated context ids,
392   /// and returning the new duplicated id set.
393   DenseSet<uint32_t> duplicateContextIds(
394       const DenseSet<uint32_t> &StackSequenceContextIds,
395       DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
396 
397   /// Propagates all duplicated context ids across the graph.
398   void propagateDuplicateContextIds(
399       const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
400 
401   /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
402   /// else to its callers. Also updates OrigNode's edges to remove any context
403   /// ids moved to the newly created edge.
404   void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
405                       bool TowardsCallee);
406 
407   /// Get the stack id corresponding to the given Id or Index (for IR this will
408   /// return itself, for a summary index this will return the id recorded in the
409   /// index for that stack id index value).
410   uint64_t getStackId(uint64_t IdOrIndex) const {
411     return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
412   }
413 
414   /// Returns true if the given call targets the given function.
415   bool calleeMatchesFunc(CallTy Call, const FuncTy *Func) {
416     return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(Call, Func);
417   }
418 
419   /// Get a list of nodes corresponding to the stack ids in the given
420   /// callsite's context.
421   std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
422     return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
423         Call);
424   }
425 
426   /// Get the last stack id in the context for callsite.
427   uint64_t getLastStackId(CallTy Call) {
428     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
429   }
430 
431   /// Update the allocation call to record type of allocated memory.
432   void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
433     AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
434     static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
435   }
436 
437   /// Update non-allocation call to invoke (possibly cloned) function
438   /// CalleeFunc.
439   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
440     static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
441   }
442 
443   /// Clone the given function for the given callsite, recording the mapping of
444   /// all of the function's tracked calls to their new versions in the CallMap.
445   /// Assigns new clones to clone number CloneNo.
446   FuncInfo cloneFunctionForCallsite(
447       FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
448       std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
449     return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
450         Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
451   }
452 
453   /// Gets a label to use in the dot graph for the given call clone in the given
454   /// function.
455   std::string getLabel(const FuncTy *Func, const CallTy Call,
456                        unsigned CloneNo) const {
457     return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
458   }
459 
460   /// Helpers to find the node corresponding to the given call or stackid.
461   ContextNode *getNodeForInst(const CallInfo &C);
462   ContextNode *getNodeForAlloc(const CallInfo &C);
463   ContextNode *getNodeForStackId(uint64_t StackId);
464 
465   /// Removes the node information recorded for the given call.
466   void unsetNodeForInst(const CallInfo &C);
467 
468   /// Computes the alloc type corresponding to the given context ids, by
469   /// unioning their recorded alloc types.
470   uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
471 
472   /// Returns the allocation type of the intersection of the contexts of two
473   /// nodes (based on their provided context id sets), optimized for the case
474   /// when Node1Ids is smaller than Node2Ids.
475   uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
476                                   const DenseSet<uint32_t> &Node2Ids);
477 
478   /// Returns the allocation type of the intersection of the contexts of two
479   /// nodes (based on their provided context id sets).
480   uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
481                               const DenseSet<uint32_t> &Node2Ids);
482 
483   /// Create a clone of Edge's callee and move Edge to that new callee node,
484   /// performing the necessary context id and allocation type updates.
485   /// If callee's caller edge iterator is supplied, it is updated when removing
486   /// the edge from that list.
487   ContextNode *
488   moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
489                            EdgeIter *CallerEdgeI = nullptr);
490 
491   /// Change the callee of Edge to existing callee clone NewCallee, performing
492   /// the necessary context id and allocation type updates.
493   /// If callee's caller edge iterator is supplied, it is updated when removing
494   /// the edge from that list.
495   void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
496                                      ContextNode *NewCallee,
497                                      EdgeIter *CallerEdgeI = nullptr,
498                                      bool NewClone = false);
499 
500   /// Recursively perform cloning on the graph for the given Node and its
501   /// callers, in order to uniquely identify the allocation behavior of an
502   /// allocation given its context.
503   void identifyClones(ContextNode *Node,
504                       DenseSet<const ContextNode *> &Visited);
505 
506   /// Map from each context ID to the AllocationType assigned to that context.
507   std::map<uint32_t, AllocationType> ContextIdToAllocationType;
508 
509   /// Identifies the context node created for a stack id when adding the MIB
510   /// contexts to the graph. This is used to locate the context nodes when
511   /// trying to assign the corresponding callsites with those stack ids to these
512   /// nodes.
513   std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
514 
515   /// Maps to track the calls to their corresponding nodes in the graph.
516   MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
517   MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
518 
519   /// Owner of all ContextNode unique_ptrs.
520   std::vector<std::unique_ptr<ContextNode>> NodeOwner;
521 
522   /// Perform sanity checks on graph when requested.
523   void check() const;
524 
525   /// Keeps track of the last unique context id assigned.
526   unsigned int LastContextId = 0;
527 };
528 
529 template <typename DerivedCCG, typename FuncTy, typename CallTy>
530 using ContextNode =
531     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
532 template <typename DerivedCCG, typename FuncTy, typename CallTy>
533 using ContextEdge =
534     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
535 template <typename DerivedCCG, typename FuncTy, typename CallTy>
536 using FuncInfo =
537     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
538 template <typename DerivedCCG, typename FuncTy, typename CallTy>
539 using CallInfo =
540     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
541 
542 /// CRTP derived class for graphs built from IR (regular LTO).
543 class ModuleCallsiteContextGraph
544     : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
545                                   Instruction *> {
546 public:
547   ModuleCallsiteContextGraph(
548       Module &M,
549       function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
550 
551 private:
552   friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
553                               Instruction *>;
554 
555   uint64_t getStackId(uint64_t IdOrIndex) const;
556   bool calleeMatchesFunc(Instruction *Call, const Function *Func);
557   uint64_t getLastStackId(Instruction *Call);
558   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
559   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
560   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
561   CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
562                        Instruction *>::FuncInfo
563   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
564                            std::map<CallInfo, CallInfo> &CallMap,
565                            std::vector<CallInfo> &CallsWithMetadataInFunc,
566                            unsigned CloneNo);
567   std::string getLabel(const Function *Func, const Instruction *Call,
568                        unsigned CloneNo) const;
569 
570   const Module &Mod;
571   function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
572 };
573 
574 /// Represents a call in the summary index graph, which can either be an
575 /// allocation or an interior callsite node in an allocation's context.
576 /// Holds a pointer to the corresponding data structure in the index.
577 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
578   IndexCall() : PointerUnion() {}
579   IndexCall(std::nullptr_t) : IndexCall() {}
580   IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
581   IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
582   IndexCall(PointerUnion PT) : PointerUnion(PT) {}
583 
584   IndexCall *operator->() { return this; }
585 
586   PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
587 
588   void print(raw_ostream &OS) const {
589     if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
590       OS << *AI;
591     } else {
592       auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
593       assert(CI);
594       OS << *CI;
595     }
596   }
597 };
598 
599 /// CRTP derived class for graphs built from summary index (ThinLTO).
600 class IndexCallsiteContextGraph
601     : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
602                                   IndexCall> {
603 public:
604   IndexCallsiteContextGraph(
605       ModuleSummaryIndex &Index,
606       function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
607           isPrevailing);
608 
609 private:
610   friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
611                               IndexCall>;
612 
613   uint64_t getStackId(uint64_t IdOrIndex) const;
614   bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
615   uint64_t getLastStackId(IndexCall &Call);
616   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
617   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
618   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
619   CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
620                        IndexCall>::FuncInfo
621   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
622                            std::map<CallInfo, CallInfo> &CallMap,
623                            std::vector<CallInfo> &CallsWithMetadataInFunc,
624                            unsigned CloneNo);
625   std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
626                        unsigned CloneNo) const;
627 
628   // Saves mapping from function summaries containing memprof records back to
629   // their VI, for use in checking and debugging.
630   std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
631 
632   const ModuleSummaryIndex &Index;
633 };
634 } // namespace
635 
636 namespace llvm {
637 template <>
638 struct DenseMapInfo<typename CallsiteContextGraph<
639     ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
640     : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
641 template <>
642 struct DenseMapInfo<typename CallsiteContextGraph<
643     IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
644     : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
645 template <>
646 struct DenseMapInfo<IndexCall>
647     : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
648 } // end namespace llvm
649 
650 namespace {
651 
652 struct FieldSeparator {
653   bool Skip = true;
654   const char *Sep;
655 
656   FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
657 };
658 
659 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
660   if (FS.Skip) {
661     FS.Skip = false;
662     return OS;
663   }
664   return OS << FS.Sep;
665 }
666 
667 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
668 // type we should actually use on the corresponding allocation.
669 // If we can't clone a node that has NotCold+Cold alloc type, we will fall
670 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
671 // from NotCold.
672 AllocationType allocTypeToUse(uint8_t AllocTypes) {
673   assert(AllocTypes != (uint8_t)AllocationType::None);
674   if (AllocTypes ==
675       ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
676     return AllocationType::NotCold;
677   else
678     return (AllocationType)AllocTypes;
679 }
680 
681 // Helper to check if the alloc types for all edges recorded in the
682 // InAllocTypes vector match the alloc types for all edges in the Edges
683 // vector.
684 template <typename DerivedCCG, typename FuncTy, typename CallTy>
685 bool allocTypesMatch(
686     const std::vector<uint8_t> &InAllocTypes,
687     const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
688         &Edges) {
689   return std::equal(
690       InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
691       [](const uint8_t &l,
692          const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
693         // Can share if one of the edges is None type - don't
694         // care about the type along that edge as it doesn't
695         // exist for those context ids.
696         if (l == (uint8_t)AllocationType::None ||
697             r->AllocTypes == (uint8_t)AllocationType::None)
698           return true;
699         return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
700       });
701 }
702 
703 } // end anonymous namespace
704 
705 template <typename DerivedCCG, typename FuncTy, typename CallTy>
706 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
707 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
708     const CallInfo &C) {
709   ContextNode *Node = getNodeForAlloc(C);
710   if (Node)
711     return Node;
712 
713   return NonAllocationCallToContextNodeMap.lookup(C);
714 }
715 
716 template <typename DerivedCCG, typename FuncTy, typename CallTy>
717 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
718 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
719     const CallInfo &C) {
720   return AllocationCallToContextNodeMap.lookup(C);
721 }
722 
723 template <typename DerivedCCG, typename FuncTy, typename CallTy>
724 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
725 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
726     uint64_t StackId) {
727   auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
728   if (StackEntryNode != StackEntryIdToContextNodeMap.end())
729     return StackEntryNode->second;
730   return nullptr;
731 }
732 
733 template <typename DerivedCCG, typename FuncTy, typename CallTy>
734 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
735     const CallInfo &C) {
736   AllocationCallToContextNodeMap.erase(C) ||
737       NonAllocationCallToContextNodeMap.erase(C);
738   assert(!AllocationCallToContextNodeMap.count(C) &&
739          !NonAllocationCallToContextNodeMap.count(C));
740 }
741 
742 template <typename DerivedCCG, typename FuncTy, typename CallTy>
743 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
744     addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
745                           unsigned int ContextId) {
746   for (auto &Edge : CallerEdges) {
747     if (Edge->Caller == Caller) {
748       Edge->AllocTypes |= (uint8_t)AllocType;
749       Edge->getContextIds().insert(ContextId);
750       return;
751     }
752   }
753   std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
754       this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
755   CallerEdges.push_back(Edge);
756   Caller->CalleeEdges.push_back(Edge);
757 }
758 
759 template <typename DerivedCCG, typename FuncTy, typename CallTy>
760 void CallsiteContextGraph<
761     DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
762   for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
763     auto Edge = *EI;
764     if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
765       assert(Edge->ContextIds.empty());
766       Edge->Callee->eraseCallerEdge(Edge.get());
767       EI = Node->CalleeEdges.erase(EI);
768     } else
769       ++EI;
770   }
771 }
772 
773 template <typename DerivedCCG, typename FuncTy, typename CallTy>
774 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
775 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
776     findEdgeFromCallee(const ContextNode *Callee) {
777   for (const auto &Edge : CalleeEdges)
778     if (Edge->Callee == Callee)
779       return Edge.get();
780   return nullptr;
781 }
782 
783 template <typename DerivedCCG, typename FuncTy, typename CallTy>
784 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
785 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
786     findEdgeFromCaller(const ContextNode *Caller) {
787   for (const auto &Edge : CallerEdges)
788     if (Edge->Caller == Caller)
789       return Edge.get();
790   return nullptr;
791 }
792 
793 template <typename DerivedCCG, typename FuncTy, typename CallTy>
794 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
795     eraseCalleeEdge(const ContextEdge *Edge) {
796   auto EI = llvm::find_if(
797       CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
798         return CalleeEdge.get() == Edge;
799       });
800   assert(EI != CalleeEdges.end());
801   CalleeEdges.erase(EI);
802 }
803 
804 template <typename DerivedCCG, typename FuncTy, typename CallTy>
805 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
806     eraseCallerEdge(const ContextEdge *Edge) {
807   auto EI = llvm::find_if(
808       CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
809         return CallerEdge.get() == Edge;
810       });
811   assert(EI != CallerEdges.end());
812   CallerEdges.erase(EI);
813 }
814 
815 template <typename DerivedCCG, typename FuncTy, typename CallTy>
816 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
817     DenseSet<uint32_t> &ContextIds) {
818   uint8_t BothTypes =
819       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
820   uint8_t AllocType = (uint8_t)AllocationType::None;
821   for (auto Id : ContextIds) {
822     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
823     // Bail early if alloc type reached both, no further refinement.
824     if (AllocType == BothTypes)
825       return AllocType;
826   }
827   return AllocType;
828 }
829 
830 template <typename DerivedCCG, typename FuncTy, typename CallTy>
831 uint8_t
832 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
833     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
834   uint8_t BothTypes =
835       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
836   uint8_t AllocType = (uint8_t)AllocationType::None;
837   for (auto Id : Node1Ids) {
838     if (!Node2Ids.count(Id))
839       continue;
840     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
841     // Bail early if alloc type reached both, no further refinement.
842     if (AllocType == BothTypes)
843       return AllocType;
844   }
845   return AllocType;
846 }
847 
848 template <typename DerivedCCG, typename FuncTy, typename CallTy>
849 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
850     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
851   if (Node1Ids.size() < Node2Ids.size())
852     return intersectAllocTypesImpl(Node1Ids, Node2Ids);
853   else
854     return intersectAllocTypesImpl(Node2Ids, Node1Ids);
855 }
856 
857 template <typename DerivedCCG, typename FuncTy, typename CallTy>
858 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
859 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
860     CallInfo Call, const FuncTy *F) {
861   assert(!getNodeForAlloc(Call));
862   NodeOwner.push_back(
863       std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
864   ContextNode *AllocNode = NodeOwner.back().get();
865   AllocationCallToContextNodeMap[Call] = AllocNode;
866   NodeToCallingFunc[AllocNode] = F;
867   // Use LastContextId as a unique id for MIB allocation nodes.
868   AllocNode->OrigStackOrAllocId = LastContextId;
869   // Alloc type should be updated as we add in the MIBs. We should assert
870   // afterwards that it is not still None.
871   AllocNode->AllocTypes = (uint8_t)AllocationType::None;
872 
873   return AllocNode;
874 }
875 
876 template <typename DerivedCCG, typename FuncTy, typename CallTy>
877 template <class NodeT, class IteratorT>
878 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
879     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
880     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
881   // Treat the hot alloc type as NotCold until the disambiguation for "hot"
882   // is done.
883   if (AllocType == AllocationType::Hot)
884     AllocType = AllocationType::NotCold;
885 
886   ContextIdToAllocationType[++LastContextId] = AllocType;
887 
888   // Update alloc type and context ids for this MIB.
889   AllocNode->AllocTypes |= (uint8_t)AllocType;
890   AllocNode->ContextIds.insert(LastContextId);
891 
892   // Now add or update nodes for each stack id in alloc's context.
893   // Later when processing the stack ids on non-alloc callsites we will adjust
894   // for any inlining in the context.
895   ContextNode *PrevNode = AllocNode;
896   // Look for recursion (direct recursion should have been collapsed by
897   // module summary analysis, here we should just be detecting mutual
898   // recursion). Mark these nodes so we don't try to clone.
899   SmallSet<uint64_t, 8> StackIdSet;
900   // Skip any stack ids already on the allocation call (due to inlining).
901   for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
902        ContextIter != StackContext.end(); ++ContextIter) {
903     auto StackId = getStackId(*ContextIter);
904     ContextNode *StackNode = getNodeForStackId(StackId);
905     if (!StackNode) {
906       NodeOwner.push_back(
907           std::make_unique<ContextNode>(/*IsAllocation=*/false));
908       StackNode = NodeOwner.back().get();
909       StackEntryIdToContextNodeMap[StackId] = StackNode;
910       StackNode->OrigStackOrAllocId = StackId;
911     }
912     auto Ins = StackIdSet.insert(StackId);
913     if (!Ins.second)
914       StackNode->Recursive = true;
915     StackNode->ContextIds.insert(LastContextId);
916     StackNode->AllocTypes |= (uint8_t)AllocType;
917     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
918     PrevNode = StackNode;
919   }
920 }
921 
922 template <typename DerivedCCG, typename FuncTy, typename CallTy>
923 DenseSet<uint32_t>
924 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
925     const DenseSet<uint32_t> &StackSequenceContextIds,
926     DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
927   DenseSet<uint32_t> NewContextIds;
928   for (auto OldId : StackSequenceContextIds) {
929     NewContextIds.insert(++LastContextId);
930     OldToNewContextIds[OldId].insert(LastContextId);
931     assert(ContextIdToAllocationType.count(OldId));
932     // The new context has the same allocation type as original.
933     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
934   }
935   return NewContextIds;
936 }
937 
938 template <typename DerivedCCG, typename FuncTy, typename CallTy>
939 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
940     propagateDuplicateContextIds(
941         const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
942   // Build a set of duplicated context ids corresponding to the input id set.
943   auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
944     DenseSet<uint32_t> NewIds;
945     for (auto Id : ContextIds)
946       if (auto NewId = OldToNewContextIds.find(Id);
947           NewId != OldToNewContextIds.end())
948         NewIds.insert(NewId->second.begin(), NewId->second.end());
949     return NewIds;
950   };
951 
952   // Recursively update context ids sets along caller edges.
953   auto UpdateCallers = [&](ContextNode *Node,
954                            DenseSet<const ContextEdge *> &Visited,
955                            auto &&UpdateCallers) -> void {
956     for (const auto &Edge : Node->CallerEdges) {
957       auto Inserted = Visited.insert(Edge.get());
958       if (!Inserted.second)
959         continue;
960       ContextNode *NextNode = Edge->Caller;
961       DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
962       // Only need to recursively iterate to NextNode via this caller edge if
963       // it resulted in any added ids to NextNode.
964       if (!NewIdsToAdd.empty()) {
965         Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
966         NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
967         UpdateCallers(NextNode, Visited, UpdateCallers);
968       }
969     }
970   };
971 
972   DenseSet<const ContextEdge *> Visited;
973   for (auto &Entry : AllocationCallToContextNodeMap) {
974     auto *Node = Entry.second;
975     // Update ids on the allocation nodes before calling the recursive
976     // update along caller edges, since this simplifies the logic during
977     // that traversal.
978     DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
979     Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
980     UpdateCallers(Node, Visited, UpdateCallers);
981   }
982 }
983 
984 template <typename DerivedCCG, typename FuncTy, typename CallTy>
985 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
986     ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
987   // Make a copy of the context ids, since this will be adjusted below as they
988   // are moved.
989   DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
990   auto &OrigEdges =
991       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
992   // Increment iterator in loop so that we can remove edges as needed.
993   for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
994     auto Edge = *EI;
995     // Remove any matching context ids from Edge; the set that was found and
996     // removed becomes the new edge's context ids. Also update the remaining
997     // (not found) ids.
998     DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
999     set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1000                  NotFoundContextIds);
1001     RemainingContextIds.swap(NotFoundContextIds);
1002     // If no matching context ids for this edge, skip it.
1003     if (NewEdgeContextIds.empty()) {
1004       ++EI;
1005       continue;
1006     }
1007     if (TowardsCallee) {
1008       auto NewEdge = std::make_shared<ContextEdge>(
1009           Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
1010           NewEdgeContextIds);
1011       NewNode->CalleeEdges.push_back(NewEdge);
1012       NewEdge->Callee->CallerEdges.push_back(NewEdge);
1013     } else {
1014       auto NewEdge = std::make_shared<ContextEdge>(
1015           NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
1016           NewEdgeContextIds);
1017       NewNode->CallerEdges.push_back(NewEdge);
1018       NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1019     }
1020     // Remove old edge if context ids empty.
1021     if (Edge->getContextIds().empty()) {
1022       if (TowardsCallee) {
1023         Edge->Callee->eraseCallerEdge(Edge.get());
1024         EI = OrigNode->CalleeEdges.erase(EI);
1025       } else {
1026         Edge->Caller->eraseCalleeEdge(Edge.get());
1027         EI = OrigNode->CallerEdges.erase(EI);
1028       }
1029       continue;
1030     }
1031     ++EI;
1032   }
1033 }
1034 
1035 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1036 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1037     assignStackNodesPostOrder(ContextNode *Node,
1038                               DenseSet<const ContextNode *> &Visited,
1039                               DenseMap<uint64_t, std::vector<CallContextInfo>>
1040                                   &StackIdToMatchingCalls) {
1041   auto Inserted = Visited.insert(Node);
1042   if (!Inserted.second)
1043     return;
1044   // Post order traversal. Iterate over a copy since we may add nodes and
1045   // therefore new callers during the recursive call, invalidating any
1046   // iterator over the original edge vector. We don't need to process these
1047   // new nodes as they were already processed on creation.
1048   auto CallerEdges = Node->CallerEdges;
1049   for (auto &Edge : CallerEdges) {
1050     // Skip any that have been removed during the recursion.
1051     if (!Edge)
1052       continue;
1053     assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1054   }
1055 
1056   // If this node's stack id is in the map, update the graph to contain new
1057   // nodes representing any inlining at interior callsites. Note we move the
1058   // associated context ids over to the new nodes.
1059 
1060   // Ignore this node if it is for an allocation or we didn't record any
1061   // stack id lists ending at it.
1062   if (Node->IsAllocation ||
1063       !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1064     return;
1065 
1066   auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1067   // Handle the simple case first. A single call with a single stack id.
1068   // In this case there is no need to create any new context nodes; simply
1069   // assign the context node for the stack id to this Call.
1070   if (Calls.size() == 1) {
1071     auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1072     if (Ids.size() == 1) {
1073       assert(SavedContextIds.empty());
1074       // It should be this Node
1075       assert(Node == getNodeForStackId(Ids[0]));
1076       if (Node->Recursive)
1077         return;
1078       Node->setCall(Call);
1079       NonAllocationCallToContextNodeMap[Call] = Node;
1080       NodeToCallingFunc[Node] = Func;
1081       return;
1082     }
1083   }
1084 
1085   // Find the node for the last stack id, which should be the same
1086   // across all calls recorded for this id, and is this node's id.
1087   uint64_t LastId = Node->OrigStackOrAllocId;
1088   ContextNode *LastNode = getNodeForStackId(LastId);
1089   // We should only have kept stack ids that had nodes.
1090   assert(LastNode);
1091 
1092   for (unsigned I = 0; I < Calls.size(); I++) {
1093     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1094     // Skip any for which we didn't assign any ids, these don't get a node in
1095     // the graph.
1096     if (SavedContextIds.empty())
1097       continue;
1098 
1099     assert(LastId == Ids.back());
1100 
1101     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1102     assert(FirstNode);
1103 
1104     // Recompute the context ids for this stack id sequence (the
1105     // intersection of the context ids of the corresponding nodes).
1106     // Start with the ids we saved in the map for this call, which could be
1107     // duplicated context ids. We have to recompute as we might have overlap
1108     // between the saved context ids for different last nodes, and
1109     // removed them already during the post order traversal.
1110     set_intersect(SavedContextIds, FirstNode->ContextIds);
1111     ContextNode *PrevNode = nullptr;
1112     for (auto Id : Ids) {
1113       ContextNode *CurNode = getNodeForStackId(Id);
1114       // We should only have kept stack ids that had nodes and weren't
1115       // recursive.
1116       assert(CurNode);
1117       assert(!CurNode->Recursive);
1118       if (!PrevNode) {
1119         PrevNode = CurNode;
1120         continue;
1121       }
1122       auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1123       if (!Edge) {
1124         SavedContextIds.clear();
1125         break;
1126       }
1127       PrevNode = CurNode;
1128       set_intersect(SavedContextIds, Edge->getContextIds());
1129 
1130       // If we now have no context ids for clone, skip this call.
1131       if (SavedContextIds.empty())
1132         break;
1133     }
1134     if (SavedContextIds.empty())
1135       continue;
1136 
1137     // Create new context node.
1138     NodeOwner.push_back(
1139         std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1140     ContextNode *NewNode = NodeOwner.back().get();
1141     NodeToCallingFunc[NewNode] = Func;
1142     NonAllocationCallToContextNodeMap[Call] = NewNode;
1143     NewNode->ContextIds = SavedContextIds;
1144     NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
1145 
1146     // Connect to callees of innermost stack frame in inlined call chain.
1147     // This updates context ids for FirstNode's callees to reflect those
1148     // moved to NewNode.
1149     connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1150 
1151     // Connect to callers of outermost stack frame in inlined call chain.
1152     // This updates context ids for LastNode's callers to reflect those
1153     // moved to NewNode.
1154     connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1155 
1156     // Now we need to remove context ids from edges/nodes between First and
1157     // Last Node.
1158     PrevNode = nullptr;
1159     for (auto Id : Ids) {
1160       ContextNode *CurNode = getNodeForStackId(Id);
1161       // We should only have kept stack ids that had nodes.
1162       assert(CurNode);
1163 
1164       // Remove the context ids moved to NewNode from CurNode, and the
1165       // edge from the prior node.
1166       set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1167       if (PrevNode) {
1168         auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1169         assert(PrevEdge);
1170         set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1171         if (PrevEdge->getContextIds().empty()) {
1172           PrevNode->eraseCallerEdge(PrevEdge);
1173           CurNode->eraseCalleeEdge(PrevEdge);
1174         }
1175       }
1176       PrevNode = CurNode;
1177     }
1178   }
1179 }
1180 
1181 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1182 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1183   // Map of stack id to all calls with that as the last (outermost caller)
1184   // callsite id that has a context node (some might not due to pruning
1185   // performed during matching of the allocation profile contexts).
1186   // The CallContextInfo contains the Call and a list of its stack ids with
1187   // ContextNodes, the function containing Call, and the set of context ids
1188   // the analysis will eventually identify for use in any new node created
1189   // for that callsite.
1190   DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1191   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1192     for (auto &Call : CallsWithMetadata) {
1193       // Ignore allocations, already handled.
1194       if (AllocationCallToContextNodeMap.count(Call))
1195         continue;
1196       auto StackIdsWithContextNodes =
1197           getStackIdsWithContextNodesForCall(Call.call());
1198       // If there were no nodes created for MIBs on allocs (maybe this was in
1199       // the unambiguous part of the MIB stack that was pruned), ignore.
1200       if (StackIdsWithContextNodes.empty())
1201         continue;
1202       // Otherwise, record this Call along with the list of ids for the last
1203       // (outermost caller) stack id with a node.
1204       StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1205           {Call.call(), StackIdsWithContextNodes, Func, {}});
1206     }
1207   }
1208 
1209   // First make a pass through all stack ids that correspond to a call,
1210   // as identified in the above loop. Compute the context ids corresponding to
1211   // each of these calls when they correspond to multiple stack ids due to
1212   // inlining. Perform any duplication of context ids required when
1213   // there is more than one call with the same stack ids. Their (possibly newly
1214   // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1215   DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1216   for (auto &It : StackIdToMatchingCalls) {
1217     auto &Calls = It.getSecond();
1218     // Skip single calls with a single stack id. These don't need a new node.
1219     if (Calls.size() == 1) {
1220       auto &Ids = std::get<1>(Calls[0]);
1221       if (Ids.size() == 1)
1222         continue;
1223     }
1224     // In order to do the best and maximal matching of inlined calls to context
1225     // node sequences we will sort the vectors of stack ids in descending order
1226     // of length, and within each length, lexicographically by stack id. The
1227     // latter is so that we can specially handle calls that have identical stack
1228     // id sequences (either due to cloning or artificially because of the MIB
1229     // context pruning).
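    // For example (hypothetical stack ids), {1, 2, 3} sorts before {1, 2},
    // and {1, 2} sorts before {1, 3}; calls with identical id sequences end
    // up adjacent after the stable sort.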
1230     std::stable_sort(Calls.begin(), Calls.end(),
1231                      [](const CallContextInfo &A, const CallContextInfo &B) {
1232                        auto &IdsA = std::get<1>(A);
1233                        auto &IdsB = std::get<1>(B);
1234                        return IdsA.size() > IdsB.size() ||
1235                               (IdsA.size() == IdsB.size() && IdsA < IdsB);
1236                      });
1237 
1238     // Find the node for the last stack id, which should be the same
1239     // across all calls recorded for this id, and is the id for this
1240     // entry in the StackIdToMatchingCalls map.
1241     uint64_t LastId = It.getFirst();
1242     ContextNode *LastNode = getNodeForStackId(LastId);
1243     // We should only have kept stack ids that had nodes.
1244     assert(LastNode);
1245 
1246     if (LastNode->Recursive)
1247       continue;
1248 
1249     // Initialize the context ids with the last node's. We will subsequently
1250     // refine the context ids by computing the intersection along all edges.
1251     DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
1252     assert(!LastNodeContextIds.empty());
1253 
1254     for (unsigned I = 0; I < Calls.size(); I++) {
1255       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1256       assert(SavedContextIds.empty());
1257       assert(LastId == Ids.back());
1258 
1259       // First compute the context ids for this stack id sequence (the
1260       // intersection of the context ids of the corresponding nodes).
1261       // Start with the remaining saved ids for the last node.
1262       assert(!LastNodeContextIds.empty());
1263       DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1264 
1265       ContextNode *PrevNode = LastNode;
1266       ContextNode *CurNode = LastNode;
1267       bool Skip = false;
1268 
1269       // Iterate backwards through the stack Ids, starting after the last Id
1270       // in the list, which was handled once outside for all Calls.
1271       for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1272         auto Id = *IdIter;
1273         CurNode = getNodeForStackId(Id);
1274         // We should only have kept stack ids that had nodes.
1275         assert(CurNode);
1276 
1277         if (CurNode->Recursive) {
1278           Skip = true;
1279           break;
1280         }
1281 
1282         auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1283         // If there is no edge then the nodes belong to different MIB contexts,
1284         // and we should skip this inlined context sequence. For example, this
1285         // particular inlined context may include stack ids A->B, and we may
1286         // indeed have nodes for both A and B, but it is possible that they were
1287         // never profiled in sequence in a single MIB for any allocation (i.e.
1288         // we might have profiled an allocation that involves the callsite A,
1289         // but through a different one of its callee callsites, and we might
1290         // have profiled an allocation that involves callsite B, but reached
1291         // from a different caller callsite).
1292         if (!Edge) {
1293           Skip = true;
1294           break;
1295         }
1296         PrevNode = CurNode;
1297 
1298         // Update the context ids, which is the intersection of the ids along
1299         // all edges in the sequence.
1300         set_intersect(StackSequenceContextIds, Edge->getContextIds());
1301 
1302         // If we now have no context ids for the clone, skip this call.
1303         if (StackSequenceContextIds.empty()) {
1304           Skip = true;
1305           break;
1306         }
1307       }
1308       if (Skip)
1309         continue;
1310 
1311       // If some of this call's stack ids did not have corresponding nodes (due
1312       // to pruning), don't include any context ids for contexts that extend
1313       // beyond these nodes. Otherwise we would be matching part of unrelated /
1314       // not fully matching stack contexts. To do this, subtract any context ids
1315       // found in caller nodes of the last node found above.
1316       if (Ids.back() != getLastStackId(Call)) {
1317         for (const auto &PE : CurNode->CallerEdges) {
1318           set_subtract(StackSequenceContextIds, PE->getContextIds());
1319           if (StackSequenceContextIds.empty())
1320             break;
1321         }
1322         // If we now have no context ids for the clone, skip this call.
1323         if (StackSequenceContextIds.empty())
1324           continue;
1325       }
1326 
1327       // Check if the next set of stack ids is the same (since the Calls vector
1328       // of tuples is sorted by the stack ids we can just look at the next one).
1329       bool DuplicateContextIds = false;
1330       if (I + 1 < Calls.size()) {
1331         auto NextIds = std::get<1>(Calls[I + 1]);
1332         DuplicateContextIds = Ids == NextIds;
1333       }
1334 
1335       // If we don't have duplicate context ids, then we can assign all the
1336       // context ids computed for the original node sequence to this call.
1337       // If there are duplicate calls with the same stack ids then we synthesize
1338       // new context ids that are duplicates of the originals. These are
1339       // assigned to SavedContextIds, which is a reference into the map entry
1340       // for this call, allowing us to access these ids later on.
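      // For illustration (hypothetical ids): if two distinct calls were both
      // inlined through the same stack id sequence, one of them keeps the
      // original context ids while each of the others is given fresh duplicate
      // ids here, which propagateDuplicateContextIds later adds throughout the
      // rest of the graph.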
1341       OldToNewContextIds.reserve(OldToNewContextIds.size() +
1342                                  StackSequenceContextIds.size());
1343       SavedContextIds =
1344           DuplicateContextIds
1345               ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1346               : StackSequenceContextIds;
1347       assert(!SavedContextIds.empty());
1348 
1349       if (!DuplicateContextIds) {
1350         // Update saved last node's context ids to remove those that are
1351         // assigned to other calls, so that it is ready for the next call at
1352         // this stack id.
1353         set_subtract(LastNodeContextIds, StackSequenceContextIds);
1354         if (LastNodeContextIds.empty())
1355           break;
1356       }
1357     }
1358   }
1359 
1360   // Propagate the duplicate context ids over the graph.
1361   propagateDuplicateContextIds(OldToNewContextIds);
1362 
1363   if (VerifyCCG)
1364     check();
1365 
1366   // Now perform a post-order traversal over the graph, starting with the
1367   // allocation nodes, essentially processing nodes from callers to callees.
1368   // For any that contains an id in the map, update the graph to contain new
1369   // nodes representing any inlining at interior callsites. Note we move the
1370   // associated context ids over to the new nodes.
1371   DenseSet<const ContextNode *> Visited;
1372   for (auto &Entry : AllocationCallToContextNodeMap)
1373     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1374 }
1375 
1376 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1377   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1378       Call->getMetadata(LLVMContext::MD_callsite));
1379   return CallsiteContext.back();
1380 }
1381 
1382 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1383   assert(isa<CallsiteInfo *>(Call.getBase()));
1384   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1385       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1386   // Need to convert index into stack id.
1387   return Index.getStackIdAtIndex(CallsiteContext.back());
1388 }
1389 
1390 static const std::string MemProfCloneSuffix = ".memprof.";
1391 
1392 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1393   // We use CloneNo == 0 to refer to the original version, which doesn't get
1394   // renamed with a suffix.
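  // For example, a Base of "foo" with CloneNo 2 yields "foo.memprof.2".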
1395   if (!CloneNo)
1396     return Base.str();
1397   return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1398 }
1399 
1400 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1401                                                  const Instruction *Call,
1402                                                  unsigned CloneNo) const {
1403   return (Twine(Call->getFunction()->getName()) + " -> " +
1404           cast<CallBase>(Call)->getCalledFunction()->getName())
1405       .str();
1406 }
1407 
1408 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1409                                                 const IndexCall &Call,
1410                                                 unsigned CloneNo) const {
1411   auto VI = FSToVIMap.find(Func);
1412   assert(VI != FSToVIMap.end());
1413   if (isa<AllocInfo *>(Call.getBase()))
1414     return (VI->second.name() + " -> alloc").str();
1415   else {
1416     auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1417     return (VI->second.name() + " -> " +
1418             getMemProfFuncName(Callsite->Callee.name(),
1419                                Callsite->Clones[CloneNo]))
1420         .str();
1421   }
1422 }
1423 
1424 std::vector<uint64_t>
1425 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1426     Instruction *Call) {
1427   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1428       Call->getMetadata(LLVMContext::MD_callsite));
1429   return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1430       CallsiteContext);
1431 }
1432 
1433 std::vector<uint64_t>
1434 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1435   assert(isa<CallsiteInfo *>(Call.getBase()));
1436   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1437       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1438   return getStackIdsWithContextNodes<CallsiteInfo,
1439                                      SmallVector<unsigned>::const_iterator>(
1440       CallsiteContext);
1441 }
1442 
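// Collect the prefix of this call's stack ids that have corresponding context
// nodes in the graph, stopping at the first id without a node (e.g. one that
// was only in the pruned portion of the MIB contexts).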
1443 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1444 template <class NodeT, class IteratorT>
1445 std::vector<uint64_t>
1446 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1447     CallStack<NodeT, IteratorT> &CallsiteContext) {
1448   std::vector<uint64_t> StackIds;
1449   for (auto IdOrIndex : CallsiteContext) {
1450     auto StackId = getStackId(IdOrIndex);
1451     ContextNode *Node = getNodeForStackId(StackId);
1452     if (!Node)
1453       break;
1454     StackIds.push_back(StackId);
1455   }
1456   return StackIds;
1457 }
1458 
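// For illustration, an allocation call in the IR might carry memprof and
// callsite metadata like the following (hypothetical stack ids); the loop
// below builds an allocation node and stack nodes from each MIB, and records
// other calls that only have callsite metadata:
//   %call = call ptr @malloc(i64 8), !memprof !0, !callsite !5
//   !0 = !{!1, !3}           ; one MIB per profiled context
//   !1 = !{!2, !"cold"}      ; MIB: stack id list + allocation type
//   !2 = !{i64 123, i64 456}
//   !3 = !{!4, !"notcold"}
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}          ; stack id(s) for this call's own frame(s)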
1459 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1460     Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1461     : Mod(M), OREGetter(OREGetter) {
1462   for (auto &F : M) {
1463     std::vector<CallInfo> CallsWithMetadata;
1464     for (auto &BB : F) {
1465       for (auto &I : BB) {
1466         if (!isa<CallBase>(I))
1467           continue;
1468         if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1469           CallsWithMetadata.push_back(&I);
1470           auto *AllocNode = addAllocNode(&I, &F);
1471           auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1472           assert(CallsiteMD);
1473           CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1474           // Add all of the MIBs and their stack nodes.
1475           for (auto &MDOp : MemProfMD->operands()) {
1476             auto *MIBMD = cast<const MDNode>(MDOp);
1477             MDNode *StackNode = getMIBStackNode(MIBMD);
1478             assert(StackNode);
1479             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1480             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1481                 AllocNode, StackContext, CallsiteContext,
1482                 getMIBAllocType(MIBMD));
1483           }
1484           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1485           // Memprof and callsite metadata on memory allocations are no longer
1486           // needed.
1487           I.setMetadata(LLVMContext::MD_memprof, nullptr);
1488           I.setMetadata(LLVMContext::MD_callsite, nullptr);
1489         }
1490         // For callsite metadata, add to list for this function for later use.
1491         else if (I.getMetadata(LLVMContext::MD_callsite))
1492           CallsWithMetadata.push_back(&I);
1493       }
1494     }
1495     if (!CallsWithMetadata.empty())
1496       FuncToCallsWithMetadata.push_back({&F, CallsWithMetadata});
1497   }
1498 
1499   if (DumpCCG) {
1500     dbgs() << "CCG before updating call stack chains:\n";
1501     dbgs() << *this;
1502   }
1503 
1504   if (ExportToDot)
1505     exportToDot("prestackupdate");
1506 
1507   updateStackNodes();
1508 
1509   handleCallsitesWithMultipleTargets();
1510 
1511   // Strip off remaining callsite metadata, no longer needed.
1512   for (auto &FuncEntry : FuncToCallsWithMetadata)
1513     for (auto &Call : FuncEntry.second)
1514       Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1515 }
1516 
1517 IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1518     ModuleSummaryIndex &Index,
1519     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1520         isPrevailing)
1521     : Index(Index) {
1522   for (auto &I : Index) {
1523     auto VI = Index.getValueInfo(I);
1524     for (auto &S : VI.getSummaryList()) {
1525       // We should only add the prevailing nodes. Otherwise we may try to clone
1526       // in a weak copy that won't be linked (and may be different than the
1527       // prevailing version).
1528       // We only keep the memprof summary on the prevailing copy now when
1529       // building the combined index, as a space optimization; however, don't
1530       // rely on this optimization. The linker doesn't resolve local linkage
1531       // values, so don't check whether those are prevailing.
1532       if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1533           !isPrevailing(VI.getGUID(), S.get()))
1534         continue;
1535       auto *FS = dyn_cast<FunctionSummary>(S.get());
1536       if (!FS)
1537         continue;
1538       std::vector<CallInfo> CallsWithMetadata;
1539       if (!FS->allocs().empty()) {
1540         for (auto &AN : FS->mutableAllocs()) {
1541           // This can happen because of recursion elimination handling that
1542           // currently exists in ModuleSummaryAnalysis. Skip these for now.
1543           // We still added them to the summary because we need to be able to
1544           // correlate properly in applyImport in the backends.
1545           if (AN.MIBs.empty())
1546             continue;
1547           CallsWithMetadata.push_back({&AN});
1548           auto *AllocNode = addAllocNode({&AN}, FS);
1549           // Pass an empty CallStack to the CallsiteContext (second)
1550           // parameter, since for ThinLTO we already collapsed out the inlined
1551           // stack ids on the allocation call during ModuleSummaryAnalysis.
1552           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1553               EmptyContext;
1554           // Now add all of the MIBs and their stack nodes.
1555           for (auto &MIB : AN.MIBs) {
1556             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1557                 StackContext(&MIB);
1558             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1559                 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1560           }
1561           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1562           // Initialize version 0 on the summary alloc node to the current alloc
1563           // type, unless it has both types in which case make it default, so
1564           // that if we aren't able to clone, the original version always ends
1565           // up with the default allocation behavior.
1566           AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1567         }
1568       }
1569       // For callsite metadata, add to list for this function for later use.
1570       if (!FS->callsites().empty())
1571         for (auto &SN : FS->mutableCallsites())
1572           CallsWithMetadata.push_back({&SN});
1573 
1574       if (!CallsWithMetadata.empty())
1575         FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata});
1576 
1577       if (!FS->allocs().empty() || !FS->callsites().empty())
1578         FSToVIMap[FS] = VI;
1579     }
1580   }
1581 
1582   if (DumpCCG) {
1583     dbgs() << "CCG before updating call stack chains:\n";
1584     dbgs() << *this;
1585   }
1586 
1587   if (ExportToDot)
1588     exportToDot("prestackupdate");
1589 
1590   updateStackNodes();
1591 
1592   handleCallsitesWithMultipleTargets();
1593 }
1594 
1595 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1596 void CallsiteContextGraph<DerivedCCG, FuncTy,
1597                           CallTy>::handleCallsitesWithMultipleTargets() {
1598   // Look for and work around callsites that call multiple functions.
1599   // This can happen for indirect calls, which need better handling, and in
1600   // rarer cases (e.g. macro expansion).
1601   // TODO: To fix this for indirect calls we will want to perform speculative
1602   // devirtualization using either the normal PGO info with ICP, or using the
1603   // information in the profiled MemProf contexts. We can do this prior to
1604   // this transformation for regular LTO, and for ThinLTO we can simulate that
1605   // effect in the summary and perform the actual speculative devirtualization
1606   // while cloning in the ThinLTO backend.
1607   for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1608        Entry != NonAllocationCallToContextNodeMap.end();) {
1609     auto *Node = Entry->second;
1610     assert(Node->Clones.empty());
1611     // Check all node callees and see if the called function matches each callee node's function.
1612     bool Removed = false;
1613     auto Call = Node->Call.call();
1614     for (auto &Edge : Node->CalleeEdges) {
1615       if (!Edge->Callee->hasCall())
1616         continue;
1617       assert(NodeToCallingFunc.count(Edge->Callee));
1618       // Check if the called function matches that of the callee node.
1619       if (calleeMatchesFunc(Call, NodeToCallingFunc[Edge->Callee]))
1620         continue;
1621       // Work around by setting Node to have a null call, so it gets
1622       // skipped during cloning. Otherwise assignFunctions will assert
1623       // because its data structures are not designed to handle this case.
1624       Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1625       Node->setCall(CallInfo());
1626       Removed = true;
1627       break;
1628     }
1629     if (!Removed)
1630       Entry++;
1631   }
1632 }
1633 
1634 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1635   // In the Module (IR) case this is already the Id.
1636   return IdOrIndex;
1637 }
1638 
1639 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1640   // In the Index case this is an index into the stack id list in the summary
1641   // index, convert it to an Id.
1642   return Index.getStackIdAtIndex(IdOrIndex);
1643 }
1644 
1645 bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call,
1646                                                    const Function *Func) {
1647   auto *CB = dyn_cast<CallBase>(Call);
1648   if (!CB->getCalledOperand())
1649     return false;
1650   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
1651   auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1652   if (CalleeFunc == Func)
1653     return true;
1654   auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1655   return Alias && Alias->getAliasee() == Func;
1656 }
1657 
1658 bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call,
1659                                                   const FunctionSummary *Func) {
1660   ValueInfo Callee =
1661       dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
1662   // If there is no summary list then this is a call to an externally defined
1663   // symbol.
1664   AliasSummary *Alias =
1665       Callee.getSummaryList().empty()
1666           ? nullptr
1667           : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
1668   assert(FSToVIMap.count(Func));
1669   return Callee == FSToVIMap[Func] ||
1670          // If callee is an alias, check the aliasee, since only function
1671          // summary base objects will contain the stack node summaries and thus
1672          // get a context node.
1673          (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]);
1674 }
1675 
1676 static std::string getAllocTypeString(uint8_t AllocTypes) {
1677   if (!AllocTypes)
1678     return "None";
1679   std::string Str;
1680   if (AllocTypes & (uint8_t)AllocationType::NotCold)
1681     Str += "NotCold";
1682   if (AllocTypes & (uint8_t)AllocationType::Cold)
1683     Str += "Cold";
1684   return Str;
1685 }
1686 
1687 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1688 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
1689     const {
1690   print(dbgs());
1691   dbgs() << "\n";
1692 }
1693 
1694 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1695 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
1696     raw_ostream &OS) const {
1697   OS << "Node " << this << "\n";
1698   OS << "\t";
1699   printCall(OS);
1700   if (Recursive)
1701     OS << " (recursive)";
1702   OS << "\n";
1703   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
1704   OS << "\tContextIds:";
1705   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1706   std::sort(SortedIds.begin(), SortedIds.end());
1707   for (auto Id : SortedIds)
1708     OS << " " << Id;
1709   OS << "\n";
1710   OS << "\tCalleeEdges:\n";
1711   for (auto &Edge : CalleeEdges)
1712     OS << "\t\t" << *Edge << "\n";
1713   OS << "\tCallerEdges:\n";
1714   for (auto &Edge : CallerEdges)
1715     OS << "\t\t" << *Edge << "\n";
1716   if (!Clones.empty()) {
1717     OS << "\tClones: ";
1718     FieldSeparator FS;
1719     for (auto *Clone : Clones)
1720       OS << FS << Clone;
1721     OS << "\n";
1722   } else if (CloneOf) {
1723     OS << "\tClone of " << CloneOf << "\n";
1724   }
1725 }
1726 
1727 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1728 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
1729     const {
1730   print(dbgs());
1731   dbgs() << "\n";
1732 }
1733 
1734 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1735 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
1736     raw_ostream &OS) const {
1737   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
1738      << " AllocTypes: " << getAllocTypeString(AllocTypes);
1739   OS << " ContextIds:";
1740   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1741   std::sort(SortedIds.begin(), SortedIds.end());
1742   for (auto Id : SortedIds)
1743     OS << " " << Id;
1744 }
1745 
1746 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1747 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
1748   print(dbgs());
1749 }
1750 
1751 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1752 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
1753     raw_ostream &OS) const {
1754   OS << "Callsite Context Graph:\n";
1755   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1756   for (const auto Node : nodes<GraphType>(this)) {
1757     if (Node->isRemoved())
1758       continue;
1759     Node->print(OS);
1760     OS << "\n";
1761   }
1762 }
1763 
1764 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1765 static void checkEdge(
1766     const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1767   // Confirm that alloc type is not None and that we have at least one context
1768   // id.
1769   assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1770   assert(!Edge->ContextIds.empty());
1771 }
1772 
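// Verify that Node's context id set is the union of its callee edges' context
// ids, and a superset of (or equal to) its caller edges' context ids.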
1773 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1774 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1775                       bool CheckEdges = true) {
1776   if (Node->isRemoved())
1777     return;
1778   // Node's context ids should be the union of both its callee and caller edge
1779   // context ids.
1780   if (Node->CallerEdges.size()) {
1781     auto EI = Node->CallerEdges.begin();
1782     auto &FirstEdge = *EI;
1783     EI++;
1784     DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
1785     for (; EI != Node->CallerEdges.end(); EI++) {
1786       const auto &Edge = *EI;
1787       if (CheckEdges)
1788         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1789       set_union(CallerEdgeContextIds, Edge->ContextIds);
1790     }
1791     // Node can have more context ids than callers if some contexts terminate at
1792     // node and some are longer.
1793     assert(Node->ContextIds == CallerEdgeContextIds ||
1794            set_is_subset(CallerEdgeContextIds, Node->ContextIds));
1795   }
1796   if (Node->CalleeEdges.size()) {
1797     auto EI = Node->CalleeEdges.begin();
1798     auto &FirstEdge = *EI;
1799     EI++;
1800     DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
1801     for (; EI != Node->CalleeEdges.end(); EI++) {
1802       const auto &Edge = *EI;
1803       if (CheckEdges)
1804         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1805       set_union(CalleeEdgeContextIds, Edge->ContextIds);
1806     }
1807     assert(Node->ContextIds == CalleeEdgeContextIds);
1808   }
1809 }
1810 
1811 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1812 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
1813   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1814   for (const auto Node : nodes<GraphType>(this)) {
1815     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
1816     for (auto &Edge : Node->CallerEdges)
1817       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1818   }
1819 }
1820 
1821 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1822 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
1823   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1824   using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
1825 
1826   using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
1827   static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
1828 
1829   using nodes_iterator =
1830       mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
1831                       decltype(&getNode)>;
1832 
1833   static nodes_iterator nodes_begin(GraphType G) {
1834     return nodes_iterator(G->NodeOwner.begin(), &getNode);
1835   }
1836 
1837   static nodes_iterator nodes_end(GraphType G) {
1838     return nodes_iterator(G->NodeOwner.end(), &getNode);
1839   }
1840 
1841   static NodeRef getEntryNode(GraphType G) {
1842     return G->NodeOwner.begin()->get();
1843   }
1844 
1845   using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
1846   static const ContextNode<DerivedCCG, FuncTy, CallTy> *
1847   GetCallee(const EdgePtrTy &P) {
1848     return P->Callee;
1849   }
1850 
1851   using ChildIteratorType =
1852       mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
1853                           DerivedCCG, FuncTy, CallTy>>>::const_iterator,
1854                       decltype(&GetCallee)>;
1855 
1856   static ChildIteratorType child_begin(NodeRef N) {
1857     return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
1858   }
1859 
1860   static ChildIteratorType child_end(NodeRef N) {
1861     return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
1862   }
1863 };
1864 
1865 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1866 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
1867     : public DefaultDOTGraphTraits {
1868   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
1869 
1870   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1871   using GTraits = GraphTraits<GraphType>;
1872   using NodeRef = typename GTraits::NodeRef;
1873   using ChildIteratorType = typename GTraits::ChildIteratorType;
1874 
1875   static std::string getNodeLabel(NodeRef Node, GraphType G) {
1876     std::string LabelString =
1877         (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
1878          Twine(Node->OrigStackOrAllocId))
1879             .str();
1880     LabelString += "\n";
1881     if (Node->hasCall()) {
1882       auto Func = G->NodeToCallingFunc.find(Node);
1883       assert(Func != G->NodeToCallingFunc.end());
1884       LabelString +=
1885           G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
1886     } else {
1887       LabelString += "null call";
1888       if (Node->Recursive)
1889         LabelString += " (recursive)";
1890       else
1891         LabelString += " (external)";
1892     }
1893     return LabelString;
1894   }
1895 
1896   static std::string getNodeAttributes(NodeRef Node, GraphType) {
1897     std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
1898                                    getContextIds(Node->ContextIds) + "\"")
1899                                       .str();
1900     AttributeString +=
1901         (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
1903     if (Node->CloneOf) {
1904       AttributeString += ",color=\"blue\"";
1905       AttributeString += ",style=\"filled,bold,dashed\"";
1906     } else
1907       AttributeString += ",style=\"filled\"";
1908     return AttributeString;
1909   }
1910 
1911   static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
1912                                        GraphType) {
1913     auto &Edge = *(ChildIter.getCurrent());
1914     return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
1915             Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
1916         .str();
1917   }
1918 
1919   // Since the NodeOwner list includes nodes that are no longer connected to
1920   // the graph, skip them here.
1921   static bool isNodeHidden(NodeRef Node, GraphType) {
1922     return Node->isRemoved();
1923   }
1924 
1925 private:
1926   static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
1927     std::string IdString = "ContextIds:";
1928     if (ContextIds.size() < 100) {
1929       std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1930       std::sort(SortedIds.begin(), SortedIds.end());
1931       for (auto Id : SortedIds)
1932         IdString += (" " + Twine(Id)).str();
1933     } else {
1934       IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
1935     }
1936     return IdString;
1937   }
1938 
1939   static std::string getColor(uint8_t AllocTypes) {
1940     if (AllocTypes == (uint8_t)AllocationType::NotCold)
1941       // Color "brown1" actually looks like a lighter red.
1942       return "brown1";
1943     if (AllocTypes == (uint8_t)AllocationType::Cold)
1944       return "cyan";
1945     if (AllocTypes ==
1946         ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1947       // Lighter purple.
1948       return "mediumorchid1";
1949     return "gray";
1950   }
1951 
1952   static std::string getNodeId(NodeRef Node) {
1953     std::stringstream SStream;
1954     SStream << std::hex << "N0x" << (unsigned long long)Node;
1955     std::string Result = SStream.str();
1956     return Result;
1957   }
1958 };
1959 
1960 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1961 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
1962     std::string Label) const {
1963   WriteGraph(this, "", false, Label,
1964              DotFilePathPrefix + "ccg." + Label + ".dot");
1965 }
1966 
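// Create a new clone of Edge's callee node and move Edge (and its context ids)
// from the original callee onto the new clone.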
1967 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1968 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1969 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
1970     const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) {
1971   ContextNode *Node = Edge->Callee;
1972   NodeOwner.push_back(
1973       std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
1974   ContextNode *Clone = NodeOwner.back().get();
1975   Node->addClone(Clone);
1976   assert(NodeToCallingFunc.count(Node));
1977   NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
1978   moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true);
1979   return Clone;
1980 }
1981 
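// Reconnect Edge to NewCallee (an existing clone of Edge's current callee),
// transferring the edge's context ids to NewCallee and updating the alloc
// types and callee edges of both the old and new callee nodes.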
1982 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1983 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1984     moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
1985                                   ContextNode *NewCallee, EdgeIter *CallerEdgeI,
1986                                   bool NewClone) {
1987   // NewCallee and Edge's current callee must be clones of the same original
1988   // node (Edge's current callee may be the original node too).
1989   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
1990   auto &EdgeContextIds = Edge->getContextIds();
1991   ContextNode *OldCallee = Edge->Callee;
1992   if (CallerEdgeI)
1993     *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
1994   else
1995     OldCallee->eraseCallerEdge(Edge.get());
1996   Edge->Callee = NewCallee;
1997   NewCallee->CallerEdges.push_back(Edge);
1998   // Don't need to update Edge's context ids since we are simply reconnecting
1999   // it.
2000   set_subtract(OldCallee->ContextIds, EdgeContextIds);
2001   NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end());
2002   NewCallee->AllocTypes |= Edge->AllocTypes;
2003   OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2004   // OldCallee alloc type should be None iff its context id set is now empty.
2005   assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2006          OldCallee->ContextIds.empty());
2007   // Now walk the old callee node's callee edges and move Edge's context ids
2008   // over to the corresponding edge into the clone (which is created here if
2009   // this is a newly created clone).
2010   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2011     // The context ids moving to the new callee are the subset of this edge's
2012     // context ids and the context ids on the caller edge being moved.
2013     DenseSet<uint32_t> EdgeContextIdsToMove =
2014         set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
2015     set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2016     OldCalleeEdge->AllocTypes =
2017         computeAllocType(OldCalleeEdge->getContextIds());
2018     if (!NewClone) {
2019       // Update context ids / alloc type on corresponding edge to NewCallee.
2020       // There is a chance this may not exist if we are reusing an existing
2021       // clone, specifically during function assignment, where we would have
2022       // removed none type edges after creating the clone. If we can't find
2023       // a corresponding edge there, fall through to the cloning below.
2024       if (auto *NewCalleeEdge =
2025               NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2026         NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2027                                               EdgeContextIdsToMove.end());
2028         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2029         continue;
2030       }
2031     }
2032     auto NewEdge = std::make_shared<ContextEdge>(
2033         OldCalleeEdge->Callee, NewCallee,
2034         computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2035     NewCallee->CalleeEdges.push_back(NewEdge);
2036     NewEdge->Callee->CallerEdges.push_back(NewEdge);
2037   }
2038   if (VerifyCCG) {
2039     checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2040     checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2041     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2042       checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2043                                             /*CheckEdges=*/false);
2044     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2045       checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2046                                             /*CheckEdges=*/false);
2047   }
2048 }
2049 
2050 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2051 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2052   DenseSet<const ContextNode *> Visited;
2053   for (auto &Entry : AllocationCallToContextNodeMap)
2054     identifyClones(Entry.second, Visited);
2055 }
2056 
2057 // Helper function to check whether an AllocType is cold, notcold, or both.
2058 bool checkColdOrNotCold(uint8_t AllocType) {
2059   return (AllocType == (uint8_t)AllocationType::Cold) ||
2060          (AllocType == (uint8_t)AllocationType::NotCold) ||
2061          (AllocType ==
2062           ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2063 }
2064 
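// Recursively identify cloning opportunities for Node's callers first (we are
// called starting from the allocation nodes), then clone Node itself for any
// caller edges whose allocation types can be disambiguated from the rest.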
2065 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2066 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2067     ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2068   if (VerifyNodes)
2069     checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2070   assert(!Node->CloneOf);
2071 
2072   // If Node has a null call, then either it wasn't found in the module (regular
2073   // LTO) or summary index (ThinLTO), or there were other conditions blocking
2074   // cloning (e.g. recursion, calls multiple targets, etc).
2075   // Do this here so that we don't try to recursively clone callers below, which
2076   // isn't useful at least for this node.
2077   if (!Node->hasCall())
2078     return;
2079 
2080 #ifndef NDEBUG
2081   auto Insert =
2082 #endif
2083       Visited.insert(Node);
2084   // We should not have visited this node yet.
2085   assert(Insert.second);
2086   // The recursive call to identifyClones may delete the current edge from the
2087   // CallerEdges vector. Make a copy and iterate on that, which is simpler than
2088   // passing in an iterator and having the recursive call erase from it. Other
2089   // edges may also get removed during the recursion; those will have null
2090   // Callee and Caller pointers (and are deleted later), so we skip them below.
2091   {
2092     auto CallerEdges = Node->CallerEdges;
2093     for (auto &Edge : CallerEdges) {
2094       // Skip any that have been removed by an earlier recursive call.
2095       if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2096         assert(!llvm::count(Node->CallerEdges, Edge));
2097         continue;
2098       }
2099       // Ignore any caller we previously visited via another edge.
2100       if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2101         identifyClones(Edge->Caller, Visited);
2102       }
2103     }
2104   }
2105 
2106   // Check if we reached an unambiguous call or have only a single caller.
2107   if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2108     return;
2109 
2110   // We need to clone.
2111 
2112   // Try to keep the original version as alloc type NotCold. This will make
2113   // cases with indirect calls or any other situation with an unknown call to
2114   // the original function get the default behavior. We do this by sorting the
2115   // CallerEdges of the Node we will clone by alloc type.
2116   //
2117   // Give NotCold edges the lowest sort priority so those edges are at the end of
2118   // the caller edges vector, and stay on the original version (since the below
2119   // code clones greedily until it finds all remaining edges have the same type
2120   // and leaves the remaining ones on the original Node).
2121   //
2122   // We shouldn't actually have any None type edges, so the sorting priority for
2123   // that is arbitrary, and we assert in that case below.
2124   const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2125                                                /*Cold*/ 1,
2126                                                /*NotColdCold*/ 2};
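  // With these priorities (indexed by the AllocTypes bitmask), Cold-only edges
  // sort first, mixed NotCold+Cold edges next, and NotCold-only edges last, so
  // NotCold contexts tend to remain on the original (uncloned) node.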
2127   std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2128                    [&](const std::shared_ptr<ContextEdge> &A,
2129                        const std::shared_ptr<ContextEdge> &B) {
2130                      assert(checkColdOrNotCold(A->AllocTypes) &&
2131                             checkColdOrNotCold(B->AllocTypes));
2132 
2133                      if (A->AllocTypes == B->AllocTypes)
2134                        // Use the first context id for each edge as a
2135                        // tie-breaker.
2136                        return *A->ContextIds.begin() < *B->ContextIds.begin();
2137                      return AllocTypeCloningPriority[A->AllocTypes] <
2138                             AllocTypeCloningPriority[B->AllocTypes];
2139                    });
2140 
2141   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2142 
2143   // Iterate until we find no more opportunities for disambiguating the alloc
2144   // types via cloning. In most cases this loop will terminate once the Node
2145   // has a single allocation type, in which case no more cloning is needed.
2146   // We need to be able to remove Edge from CallerEdges, so need to adjust
2147   // iterator inside the loop.
2148   for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2149     auto CallerEdge = *EI;
2150 
2151     // See if cloning the prior caller edge left this node with a single alloc
2152     // type or a single caller. In that case no more cloning of Node is needed.
2153     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2154       break;
2155 
2156     // Compute the node callee edge alloc types corresponding to the context ids
2157     // for this caller edge.
2158     std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2159     CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2160     for (auto &CalleeEdge : Node->CalleeEdges)
2161       CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2162           CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2163 
2164     // Don't clone if doing so will not disambiguate any alloc types amongst
2165     // caller edges (including the callee edges that would be cloned).
2166     // Otherwise we will simply move all edges to the clone.
2167     //
2168     // First check if by cloning we will disambiguate the caller allocation
2169     // type from node's allocation type. Query allocTypeToUse so that we don't
2170     // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2171     // neither of these should be None type.
2172     //
2173     // Then check if by cloning node at least one of the callee edges will be
2174     // disambiguated by splitting out different context ids.
2175     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2176     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2177     if (allocTypeToUse(CallerEdge->AllocTypes) ==
2178             allocTypeToUse(Node->AllocTypes) &&
2179         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2180             CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2181       ++EI;
2182       continue;
2183     }
2184 
2185     // First see if we can use an existing clone. Check each clone and its
2186     // callee edges for matching alloc types.
2187     ContextNode *Clone = nullptr;
2188     for (auto *CurClone : Node->Clones) {
2189       if (allocTypeToUse(CurClone->AllocTypes) !=
2190           allocTypeToUse(CallerEdge->AllocTypes))
2191         continue;
2192 
2193       if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2194               CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2195         continue;
2196       Clone = CurClone;
2197       break;
2198     }
2199 
2200     // The edge iterator is adjusted when we move the CallerEdge to the clone.
2201     if (Clone)
2202       moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI);
2203     else
2204       Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI);
2205 
2206     assert(EI == Node->CallerEdges.end() ||
2207            Node->AllocTypes != (uint8_t)AllocationType::None);
2208     // Sanity check that no alloc types on clone or its edges are None.
2209     assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2210     assert(llvm::none_of(
2211         Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2212           return E->AllocTypes == (uint8_t)AllocationType::None;
2213         }));
2214   }
2215 
2216   // Cloning may have resulted in some cloned callee edges with type None,
2217   // because they aren't carrying any contexts. Remove those edges.
2218   for (auto *Clone : Node->Clones) {
2219     removeNoneTypeCalleeEdges(Clone);
2220     if (VerifyNodes)
2221       checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2222   }
2223   // We should still have some context ids on the original Node.
2224   assert(!Node->ContextIds.empty());
2225 
2226   // Remove any callee edges that ended up with alloc type None after creating
2227   // clones and updating callee edges.
2228   removeNoneTypeCalleeEdges(Node);
2229 
2230   // Sanity check that no alloc types on node or edges are None.
2231   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2232   assert(llvm::none_of(Node->CalleeEdges,
2233                        [&](const std::shared_ptr<ContextEdge> &E) {
2234                          return E->AllocTypes == (uint8_t)AllocationType::None;
2235                        }));
2236   assert(llvm::none_of(Node->CallerEdges,
2237                        [&](const std::shared_ptr<ContextEdge> &E) {
2238                          return E->AllocTypes == (uint8_t)AllocationType::None;
2239                        }));
2240 
2241   if (VerifyNodes)
2242     checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2243 }
2244 
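// Annotate the allocation call with a "memprof" string function attribute
// recording its allocation type (e.g. "cold"), and emit an optimization remark.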
2245 void ModuleCallsiteContextGraph::updateAllocationCall(
2246     CallInfo &Call, AllocationType AllocType) {
2247   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2248   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2249                                 "memprof", AllocTypeString);
2250   cast<CallBase>(Call.call())->addFnAttr(A);
2251   OREGetter(Call.call()->getFunction())
2252       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2253             << ore::NV("AllocationCall", Call.call()) << " in clone "
2254             << ore::NV("Caller", Call.call()->getFunction())
2255             << " marked with memprof allocation attribute "
2256             << ore::NV("Attribute", AllocTypeString));
2257 }
2258 
2259 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2260                                                      AllocationType AllocType) {
2261   auto *AI = Call.call().dyn_cast<AllocInfo *>();
2262   assert(AI);
2263   assert(AI->Versions.size() > Call.cloneNo());
2264   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2265 }
2266 
2267 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2268                                             FuncInfo CalleeFunc) {
2269   if (CalleeFunc.cloneNo() > 0)
2270     cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2271   OREGetter(CallerCall.call()->getFunction())
2272       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2273             << ore::NV("Call", CallerCall.call()) << " in clone "
2274             << ore::NV("Caller", CallerCall.call()->getFunction())
2275             << " assigned to call function clone "
2276             << ore::NV("Callee", CalleeFunc.func()));
2277 }
2278 
2279 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2280                                            FuncInfo CalleeFunc) {
2281   auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2282   assert(CI &&
2283          "Caller cannot be an allocation which should not have profiled calls");
2284   assert(CI->Clones.size() > CallerCall.cloneNo());
2285   CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2286 }
2287 
2288 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2289                      Instruction *>::FuncInfo
2290 ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2291     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2292     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2293   // Use existing LLVM facilities for cloning and obtaining the Call in the clone.
2294   ValueToValueMapTy VMap;
2295   auto *NewFunc = CloneFunction(Func.func(), VMap);
2296   std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2297   assert(!Func.func()->getParent()->getFunction(Name));
2298   NewFunc->setName(Name);
2299   for (auto &Inst : CallsWithMetadataInFunc) {
2300     // This map always has the initial version in it.
2301     assert(Inst.cloneNo() == 0);
2302     CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2303   }
2304   OREGetter(Func.func())
2305       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2306             << "created clone " << ore::NV("NewFunction", NewFunc));
2307   return {NewFunc, CloneNo};
2308 }
2309 
2310 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2311                      IndexCall>::FuncInfo
2312 IndexCallsiteContextGraph::cloneFunctionForCallsite(
2313     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2314     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2315   // Check how many clones we have of Call (and therefore function).
2316   // The next clone number is the current size of versions array.
2317   // Confirm this matches the CloneNo provided by the caller, which is based on
2318   // the number of function clones we have.
2319   assert(CloneNo ==
2320          (Call.call().is<AllocInfo *>()
2321               ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2322               : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2323   // Walk all the instructions in this function. Create a new version for
2324   // each (by adding an entry to the Versions/Clones summary array), and copy
2325   // over the version being called for the function clone being cloned here.
2326   // Additionally, add an entry to the CallMap for the new function clone,
2327   // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2328   // to the new call clone.
2329   for (auto &Inst : CallsWithMetadataInFunc) {
2330     // This map always has the initial version in it.
2331     assert(Inst.cloneNo() == 0);
2332     if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2333       assert(AI->Versions.size() == CloneNo);
2334       // We assign the allocation type later (in updateAllocationCall), just add
2335       // an entry for it here.
2336       AI->Versions.push_back(0);
2337     } else {
2338       auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2339       assert(CI && CI->Clones.size() == CloneNo);
2340       // We assign the clone number later (in updateCall), just add an entry for
2341       // it here.
2342       CI->Clones.push_back(0);
2343     }
2344     CallMap[Inst] = {Inst.call(), CloneNo};
2345   }
2346   return {Func.func(), CloneNo};
2347 }
2348 
2349 // This method assigns cloned callsites to functions, cloning the functions as
2350 // needed. The assignment is greedy and proceeds roughly as follows:
2351 //
2352 // For each function Func:
2353 //   For each call with graph Node having clones:
2354 //     Initialize ClonesWorklist to Node and its clones
2355 //     Initialize NodeCloneCount to 0
2356 //     While ClonesWorklist is not empty:
2357 //        Clone = pop front ClonesWorklist
2358 //        NodeCloneCount++
2359 //        If Func has been cloned less than NodeCloneCount times:
2360 //           If NodeCloneCount is 1:
2361 //             Assign Clone to original Func
2362 //             Continue
2363 //           Create a new function clone
2364 //           If other callers not assigned to call a function clone yet:
2365 //              Assign them to call new function clone
2366 //              Continue
2367 //           Assign any other caller calling the cloned version to new clone
2368 //
2369 //        For each caller of Clone:
2370 //           If caller is assigned to call a specific function clone:
2371 //             If we cannot assign Clone to that function clone:
2372 //               Create new callsite Clone NewClone
2373 //               Add NewClone to ClonesWorklist
2374 //               Continue
2375 //             Assign Clone to existing caller's called function clone
2376 //           Else:
2377 //             If Clone not already assigned to a function clone:
2378 //                Assign to first function clone without assignment
2379 //             Assign caller to selected function clone
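// For example (hypothetical): if a callsite node in foo() ends up with two
// clones whose contexts need different allocation behavior, the first clone is
// assigned to the original foo() and the second to a new function clone
// foo.memprof.1, and each caller is updated to call whichever copy of foo()
// holds the callsite clone that its context ids reach.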
2380 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2381 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2382   bool Changed = false;
2383 
2384   // Keep track of the assignment of nodes (callsites) to function clones they
2385   // call.
2386   DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2387 
2388   // Update caller node to call function version CalleeFunc, by recording the
2389   // assignment in CallsiteToCalleeFuncCloneMap.
2390   auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2391                                         const FuncInfo &CalleeFunc) {
2392     assert(Caller->hasCall());
2393     CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2394   };
2395 
2396   // Walk all functions for which we saw calls with memprof metadata, and handle
2397   // cloning for each of their calls.
2398   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2399     FuncInfo OrigFunc(Func);
2400     // Map from each clone of OrigFunc to a map of remappings of each call of
2401     // interest (from original uncloned call to the corresponding cloned call in
2402     // that function clone).
2403     std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2404     for (auto &Call : CallsWithMetadata) {
2405       ContextNode *Node = getNodeForInst(Call);
2406       // Skip call if we do not have a node for it (all uses of its stack ids
2407       // were either on inlined chains or pruned from the MIBs), or if we did
2408       // not create any clones for it.
2409       if (!Node || Node->Clones.empty())
2410         continue;
2411       assert(Node->hasCall() &&
2412              "Not having a call should have prevented cloning");
2413 
2414       // Track the assignment of function clones to clones of the current
2415       // callsite Node being handled.
2416       std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2417 
2418       // Assign callsite version CallsiteClone to function version FuncClone,
2419       // and also assign (possibly cloned) Call to CallsiteClone.
2420       auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2421                                                 CallInfo &Call,
2422                                                 ContextNode *CallsiteClone,
2423                                                 bool IsAlloc) {
2424         // Record the clone of callsite node assigned to this function clone.
2425         FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2426 
2427         assert(FuncClonesToCallMap.count(FuncClone));
2428         std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2429         CallInfo CallClone(Call);
2430         if (CallMap.count(Call))
2431           CallClone = CallMap[Call];
2432         CallsiteClone->setCall(CallClone);
2433       };
2434 
2435       // Keep track of the clones of callsite Node that need to be assigned to
2436       // function clones. This list may be expanded in the loop body below if we
2437       // find additional cloning is required.
2438       std::deque<ContextNode *> ClonesWorklist;
2439       // Ignore original Node if we moved all of its contexts to clones.
2440       if (!Node->ContextIds.empty())
2441         ClonesWorklist.push_back(Node);
2442       ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2443                             Node->Clones.end());
2444 
2445       // Now walk through all of the clones of this callsite Node that we need,
2446       // and determine the assignment to a corresponding clone of the current
2447       // function (creating new function clones as needed).
2448       unsigned NodeCloneCount = 0;
2449       while (!ClonesWorklist.empty()) {
2450         ContextNode *Clone = ClonesWorklist.front();
2451         ClonesWorklist.pop_front();
2452         NodeCloneCount++;
2453         if (VerifyNodes)
2454           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2455 
2456         // Need to create a new function clone if we have more callsite clones
2457         // than existing function clones, which would have been assigned to an
2458         // earlier clone in the list (we assign callsite clones to function
2459         // clones greedily).
2460         if (FuncClonesToCallMap.size() < NodeCloneCount) {
2461           // If this is the first callsite copy, assign it to the original function.
2462           if (NodeCloneCount == 1) {
2463             // Since FuncClonesToCallMap is empty in this case, no clones have
2464             // been created for this function yet, and no callers should have
2465             // been assigned a function clone for this callee node yet.
2466             assert(llvm::none_of(
2467                 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2468                   return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2469                 }));
2470             // Initialize with an empty call map, assign Clone to the original
2471             // function and its callers, and skip to the next clone.
2472             FuncClonesToCallMap[OrigFunc] = {};
2473             AssignCallsiteCloneToFuncClone(
2474                 OrigFunc, Call, Clone,
2475                 AllocationCallToContextNodeMap.count(Call));
2476             for (auto &CE : Clone->CallerEdges) {
2477               // Ignore any caller that does not have a recorded callsite Call.
2478               if (!CE->Caller->hasCall())
2479                 continue;
2480               RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
2481             }
2482             continue;
2483           }
2484 
2485           // First locate which copy of OrigFunc to clone again. If a caller
2486           // of this callsite clone was already assigned to call a particular
2487           // function clone, we need to redirect all of those callers to the
2488           // new function clone, and update their other callees within this
2489           // function.
2490           FuncInfo PreviousAssignedFuncClone;
2491           auto EI = llvm::find_if(
2492               Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2493                 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2494               });
2495           bool CallerAssignedToCloneOfFunc = false;
2496           if (EI != Clone->CallerEdges.end()) {
2497             const std::shared_ptr<ContextEdge> &Edge = *EI;
2498             PreviousAssignedFuncClone =
2499                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2500             CallerAssignedToCloneOfFunc = true;
2501           }
2502 
2503           // Clone function and save it along with the CallInfo map created
2504           // during cloning in the FuncClonesToCallMap.
2505           std::map<CallInfo, CallInfo> NewCallMap;
2506           unsigned CloneNo = FuncClonesToCallMap.size();
2507           assert(CloneNo > 0 && "Clone 0 is the original function, which "
2508                                 "should already exist in the map");
2509           FuncInfo NewFuncClone = cloneFunctionForCallsite(
2510               OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2511           FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
2512           FunctionClonesAnalysis++;
2513           Changed = true;
2514 
2515           // If no caller callsites were already assigned to a clone of this
2516           // function, we can simply assign this clone to the new func clone
2517           // and update all callers to it, then skip to the next clone.
2518           if (!CallerAssignedToCloneOfFunc) {
2519             AssignCallsiteCloneToFuncClone(
2520                 NewFuncClone, Call, Clone,
2521                 AllocationCallToContextNodeMap.count(Call));
2522             for (auto &CE : Clone->CallerEdges) {
2523               // Ignore any caller that does not have a recorded callsite Call.
2524               if (!CE->Caller->hasCall())
2525                 continue;
2526               RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2527             }
2528             continue;
2529           }
2530 
2531           // We may need to do additional node cloning in this case.
2532           // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
2533           // that were previously assigned to call PreviousAssignedFuncClone,
2534           // to record that they now call NewFuncClone.
2535           for (auto CE : Clone->CallerEdges) {
2536             // Ignore any caller that does not have a recorded callsite Call.
2537             if (!CE->Caller->hasCall())
2538               continue;
2539 
2540             if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
2541                 // We subsequently fall through to later handling that
2542                 // will perform any additional cloning required for
2543                 // callers that were calling other function clones.
2544                 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
2545                     PreviousAssignedFuncClone)
2546               continue;
2547 
2548             RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2549 
2550             // If we are cloning a function that was already assigned to some
2551             // callers, then essentially we are creating new callsite clones
2552             // of the other callsites in that function that are reached by those
2553             // callers. Clone the other callees of the current callsite's caller
2554             // that were already assigned to PreviousAssignedFuncClone
2555             // accordingly. This is important since we subsequently update the
2556             // calls from the nodes in the graph and their assignments to callee
2557             // functions recorded in CallsiteToCalleeFuncCloneMap.
2558             for (auto CalleeEdge : CE->Caller->CalleeEdges) {
2559               // Skip any that have been removed on an earlier iteration when
2560               // cleaning up newly None type callee edges.
2561               if (!CalleeEdge)
2562                 continue;
2563               ContextNode *Callee = CalleeEdge->Callee;
2564               // Skip the current callsite (we are looking for the other
2565               // callsites that Caller calls), as well as any callee that does
2566               // not have a recorded callsite Call.
2567               if (Callee == Clone || !Callee->hasCall())
2568                 continue;
2569               ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
2570               removeNoneTypeCalleeEdges(NewClone);
2571               // Moving the edge may have resulted in some none type
2572               // callee edges on the original Callee.
2573               removeNoneTypeCalleeEdges(Callee);
2574               assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2575               // If the Callee node was already assigned to call a specific
2576               // function version, make sure its new clone is assigned to call
2577               // that same function clone.
2578               if (CallsiteToCalleeFuncCloneMap.count(Callee))
2579                 RecordCalleeFuncOfCallsite(
2580                     NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
2581               // Update NewClone with the clone of this callsite's Call that was
2582               // created earlier for the new function clone.
2583               // Recall that we have already ensured when building the graph
2584               // that each caller can only call callsites within the same
2585               // function, so we are guaranteed that the Callee's Call is in the
2586               // current OrigFunc.
2587               // CallMap is set up as indexed by original Call at clone 0.
2588               CallInfo OrigCall(Callee->getOrigNode()->Call);
2589               OrigCall.setCloneNo(0);
2590               std::map<CallInfo, CallInfo> &CallMap =
2591                   FuncClonesToCallMap[NewFuncClone];
2592               assert(CallMap.count(OrigCall));
2593               CallInfo NewCall(CallMap[OrigCall]);
2594               assert(NewCall);
2595               NewClone->setCall(NewCall);
2596             }
2597           }
2598           // Fall through to the handling below to record the function assigned
2599           // to this callsite clone. This enables handling of cases where the
2600           // callers were assigned to different clones of a function.
2601         }
2602 
2603         // See if we can use an existing function clone. Walk through all
2604         // caller edges to see if any have already been assigned to a clone of
2605         // this callsite's function. If so, use that clone; if not (because the
2606         // function clone is already assigned to a different clone of this
2607         // callsite), then we need to clone again.
2608         // Basically, this checking is needed to handle the case where different
2609         // caller functions/callsites may need versions of this function
2610         // containing different mixes of callsite clones across the different
2611         // callsites within the function. If that happens, we need to create
2612         // additional function clones to handle the various combinations.
2613         //
2614         // Keep track of any new clones of this callsite created by the
2615         // following loop, as well as any existing clone that we decided to
2616         // assign this clone to.
2617         std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
2618         FuncInfo FuncCloneAssignedToCurCallsiteClone;
2619         // We need to be able to remove Edge from CallerEdges, so we adjust the
2620         // iterator inside the loop.
2621         for (auto EI = Clone->CallerEdges.begin();
2622              EI != Clone->CallerEdges.end();) {
2623           auto Edge = *EI;
2624           // Ignore any caller that does not have a recorded callsite Call.
2625           if (!Edge->Caller->hasCall()) {
2626             EI++;
2627             continue;
2628           }
2629           // If this caller was already assigned to call a version of OrigFunc, we
2630           // need to ensure we can assign this callsite clone to that function clone.
2631           if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
2632             FuncInfo FuncCloneCalledByCaller =
2633                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2634             // First we need to confirm that this function clone is available
2635             // for use by this callsite node clone.
2636             //
2637             // While FuncCloneToCurNodeCloneMap is built only for this Node and
2638             // its callsite clones, one of those callsite clones X could have
2639             // been assigned to the same function clone called by Edge's caller
2640             // - if Edge's caller calls another callsite within Node's original
2641             // function, and that callsite has another caller reaching clone X.
2642             // We need to clone Node again in this case.
2643             if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
2644                  FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
2645                      Clone) ||
2646                 // Detect when we have multiple callers of this callsite that
2647                 // have already been assigned to specific, and different, clones
2648                 // of OrigFunc (due to other unrelated callsites in Func they
2649                 // reach via call contexts). Is this Clone of callsite Node
2650                 // assigned to a different clone of OrigFunc? If so, clone Node
2651                 // again.
2652                 (FuncCloneAssignedToCurCallsiteClone &&
2653                  FuncCloneAssignedToCurCallsiteClone !=
2654                      FuncCloneCalledByCaller)) {
2655               // We need to use a different newly created callsite clone, in
2656               // order to assign it to another new function clone on a
2657               // subsequent iteration over the Clones array (adjusted below).
2658               // Note we specifically do not reset the
2659               // CallsiteToCalleeFuncCloneMap entry for this caller, so that
2660               // when this new clone is processed later we know which version of
2661               // the function to copy (so that other callsite clones we have
2662               // assigned to that function clone are properly cloned over). See
2663               // comments in the function cloning handling earlier.
2664 
2665               // Check if we already have cloned this callsite again while
2666               // walking through caller edges, for a caller calling the same
2667               // function clone. If so, we can move this edge to that new clone
2668               // rather than creating yet another new clone.
2669               if (FuncCloneToNewCallsiteCloneMap.count(
2670                       FuncCloneCalledByCaller)) {
2671                 ContextNode *NewClone =
2672                     FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
2673                 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
2674                 // Cleanup any none type edges cloned over.
2675                 removeNoneTypeCalleeEdges(NewClone);
2676               } else {
2677                 // Create a new callsite clone.
2678                 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
2679                 removeNoneTypeCalleeEdges(NewClone);
2680                 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
2681                     NewClone;
2682                 // Add to list of clones and process later.
2683                 ClonesWorklist.push_back(NewClone);
2684                 assert(EI == Clone->CallerEdges.end() ||
2685                        Clone->AllocTypes != (uint8_t)AllocationType::None);
2686                 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2687               }
2688               // Moving the caller edge may have resulted in some none type
2689               // callee edges.
2690               removeNoneTypeCalleeEdges(Clone);
2691               // We will handle the newly created callsite clone in a subsequent
2692               // iteration over this Node's Clones. Continue here since we
2693               // already adjusted iterator EI while moving the edge.
2694               continue;
2695             }
2696 
2697             // Otherwise, we can use the function clone already assigned to this
2698             // caller.
2699             if (!FuncCloneAssignedToCurCallsiteClone) {
2700               FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
2701               // Assign Clone to FuncCloneCalledByCaller
2702               AssignCallsiteCloneToFuncClone(
2703                   FuncCloneCalledByCaller, Call, Clone,
2704                   AllocationCallToContextNodeMap.count(Call));
2705             } else
2706               // Don't need to do anything - callsite is already calling this
2707               // function clone.
2708               assert(FuncCloneAssignedToCurCallsiteClone ==
2709                      FuncCloneCalledByCaller);
2710 
2711           } else {
2712             // We have not already assigned this caller to a version of
2713             // OrigFunc. Do the assignment now.
2714 
2715             // First check if we have already assigned this callsite clone to a
2716             // clone of OrigFunc for another caller during this iteration over
2717             // its caller edges.
2718             if (!FuncCloneAssignedToCurCallsiteClone) {
2719               // Find the first function in FuncClonesToCallMap without an assigned
2720               // clone of this callsite Node. We should always have one
2721               // available at this point due to the earlier cloning when the
2722               // FuncClonesToCallMap size was smaller than the clone number.
2723               for (auto &CF : FuncClonesToCallMap) {
2724                 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
2725                   FuncCloneAssignedToCurCallsiteClone = CF.first;
2726                   break;
2727                 }
2728               }
2729               assert(FuncCloneAssignedToCurCallsiteClone);
2730               // Assign Clone to FuncCloneAssignedToCurCallsiteClone
2731               AssignCallsiteCloneToFuncClone(
2732                   FuncCloneAssignedToCurCallsiteClone, Call, Clone,
2733                   AllocationCallToContextNodeMap.count(Call));
2734             } else
2735               assert(FuncCloneToCurNodeCloneMap
2736                          [FuncCloneAssignedToCurCallsiteClone] == Clone);
2737             // Update callers to record function version called.
2738             RecordCalleeFuncOfCallsite(Edge->Caller,
2739                                        FuncCloneAssignedToCurCallsiteClone);
2740           }
2741 
2742           EI++;
2743         }
2744       }
2745       if (VerifyCCG) {
2746         checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2747         for (const auto &PE : Node->CalleeEdges)
2748           checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
2749         for (const auto &CE : Node->CallerEdges)
2750           checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
2751         for (auto *Clone : Node->Clones) {
2752           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2753           for (const auto &PE : Clone->CalleeEdges)
2754             checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
2755           for (const auto &CE : Clone->CallerEdges)
2756             checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
2757         }
2758       }
2759     }
2760   }
2761 
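       // Recursive helper, invoked below starting from each allocation node: it
       // visits a node's clones and callers first, then updates the node's own
       // call to reflect the cloning decisions recorded above.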
2762   auto UpdateCalls = [&](ContextNode *Node,
2763                          DenseSet<const ContextNode *> &Visited,
2764                          auto &&UpdateCalls) {
2765     auto Inserted = Visited.insert(Node);
2766     if (!Inserted.second)
2767       return;
2768 
2769     for (auto *Clone : Node->Clones)
2770       UpdateCalls(Clone, Visited, UpdateCalls);
2771 
2772     for (auto &Edge : Node->CallerEdges)
2773       UpdateCalls(Edge->Caller, Visited, UpdateCalls);
2774 
2775     // Skip if there is either no call to update, or we ended up with no
2776     // context ids (we moved all edges onto other clones).
2777     if (!Node->hasCall() || Node->ContextIds.empty())
2778       return;
2779 
2780     if (Node->IsAllocation) {
2781       updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
2782       return;
2783     }
2784 
2785     if (!CallsiteToCalleeFuncCloneMap.count(Node))
2786       return;
2787 
2788     auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
2789     updateCall(Node->Call, CalleeFunc);
2790   };
2791 
2792   // Performs a DFS traversal starting from the allocation nodes to update calls to
2793   // reflect cloning decisions recorded earlier. For regular LTO this will
2794   // update the actual calls in the IR to call the appropriate function clone
2795   // (and add attributes to allocation calls), whereas for ThinLTO the decisions
2796   // are recorded in the summary entries.
2797   DenseSet<const ContextNode *> Visited;
2798   for (auto &Entry : AllocationCallToContextNodeMap)
2799     UpdateCalls(Entry.second, Visited, UpdateCalls);
2800 
2801   return Changed;
2802 }
2803 
2804 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
2805     Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
2806     std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
2807         &FuncToAliasMap) {
2808   // The first "clone" is the original copy; we should only call this if we
2809   // needed to create new clones.
2810   assert(NumClones > 1);
2811   SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
2812   VMaps.reserve(NumClones - 1);
2813   FunctionsClonedThinBackend++;
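       // Clone 0 is the original function and has no VMap; each new clone I in
       // [1, NumClones) gets a ValueToValueMapTy mapping original values to
       // their copies.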
2814   for (unsigned I = 1; I < NumClones; I++) {
2815     VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
2816     auto *NewF = CloneFunction(&F, *VMaps.back());
2817     FunctionClonesThinBackend++;
2818     // Strip memprof and callsite metadata from the clone, as they are no
2819     // longer needed.
2820     for (auto &BB : *NewF) {
2821       for (auto &Inst : BB) {
2822         Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
2823         Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
2824       }
2825     }
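         // Name the clone using the standard memprof clone naming scheme for
         // version I (see getMemProfFuncName), so that callsites updated
         // elsewhere can refer to it by name.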
2826     std::string Name = getMemProfFuncName(F.getName(), I);
2827     auto *PrevF = M.getFunction(Name);
2828     if (PrevF) {
2829       // We might have created this when adjusting a callsite in another
2830       // function. It should be a declaration.
2831       assert(PrevF->isDeclaration());
2832       NewF->takeName(PrevF);
2833       PrevF->replaceAllUsesWith(NewF);
2834       PrevF->eraseFromParent();
2835     } else
2836       NewF->setName(Name);
2837     ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
2838              << "created clone " << ore::NV("NewFunction", NewF));
2839 
2840     // Now handle aliases to this function, and clone those as well.
2841     if (!FuncToAliasMap.count(&F))
2842       continue;
2843     for (auto *A : FuncToAliasMap[&F]) {
2844       std::string Name = getMemProfFuncName(A->getName(), I);
2845       auto *PrevA = M.getNamedAlias(Name);
2846       auto *NewA = GlobalAlias::create(A->getValueType(),
2847                                        A->getType()->getPointerAddressSpace(),
2848                                        A->getLinkage(), Name, NewF);
2849       NewA->copyAttributesFrom(A);
2850       if (PrevA) {
2851         // We might have created this when adjusting a callsite in another
2852         // function. It should be a declaration.
2853         assert(PrevA->isDeclaration());
2854         NewA->takeName(PrevA);
2855         PrevA->replaceAllUsesWith(NewA);
2856         PrevA->eraseFromParent();
2857       }
2858     }
2859   }
2860   return VMaps;
2861 }
2862 
2863 // Locate the summary for F. This is complicated by the fact that it might
2864 // have been internalized or promoted.
2865 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
2866                                       const ModuleSummaryIndex *ImportSummary) {
2867   // FIXME: Ideally we would retain the original GUID in some fashion on the
2868   // function (e.g. as metadata), but for now do our best to locate the
2869   // summary without that information.
2870   ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
2871   if (!TheFnVI)
2872     // See if the function was internalized, by checking the index directly
2873     // with its original name (this avoids the name adjustment done by
2874     // getGUID() for internal symbols).
2875     TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
2876   if (TheFnVI)
2877     return TheFnVI;
2878   // Now query with the original name before any promotion was performed.
2879   StringRef OrigName =
2880       ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
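       // Reconstruct the global identifier this function would have had as a
       // local symbol in this module (original name qualified by the source
       // file name), and look up its GUID in the index.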
2881   std::string OrigId = GlobalValue::getGlobalIdentifier(
2882       OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
2883   TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
2884   if (TheFnVI)
2885     return TheFnVI;
2886   // Could be a promoted local imported from another module. We need to pass
2887   // down more info here to find the original module id. For now, try with
2888   // the OrigName which might have been stored in the OidGuidMap in the
2889   // index. This would not work if there were same-named locals in multiple
2890   // modules, however.
2891   auto OrigGUID =
2892       ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
2893   if (OrigGUID)
2894     TheFnVI = ImportSummary->getValueInfo(OrigGUID);
2895   return TheFnVI;
2896 }
2897 
2898 bool MemProfContextDisambiguation::applyImport(Module &M) {
2899   assert(ImportSummary);
2900   bool Changed = false;
2901 
2902   auto IsMemProfClone = [](const Function &F) {
2903     return F.getName().contains(MemProfCloneSuffix);
2904   };
2905 
2906   // We also need to clone any aliases that reference cloned functions, because
2907   // the modified callsites may invoke via the alias. Keep track of the aliases
2908   // for each function.
2909   std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
2910       FuncToAliasMap;
2911   for (auto &A : M.aliases()) {
2912     auto *Aliasee = A.getAliaseeObject();
2913     if (auto *F = dyn_cast<Function>(Aliasee))
2914       FuncToAliasMap[F].insert(&A);
2915   }
2916 
2917   for (auto &F : M) {
2918     if (F.isDeclaration() || IsMemProfClone(F))
2919       continue;
2920 
2921     OptimizationRemarkEmitter ORE(&F);
2922 
2923     SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
2924     bool ClonesCreated = false;
2925     unsigned NumClonesCreated = 0;
2926     auto CloneFuncIfNeeded = [&](unsigned NumClones) {
2927       // We should at least have version 0 which is the original copy.
2928       assert(NumClones > 0);
2929       // If only one copy is needed, use the original.
2930       if (NumClones == 1)
2931         return;
2932       // If we already performed cloning of this function, confirm that the
2933       // requested number of clones matches (the thin link should ensure the
2934       // number of clones for each constituent callsite is consistent within
2935       // each function), before returning.
2936       if (ClonesCreated) {
2937         assert(NumClonesCreated == NumClones);
2938         return;
2939       }
2940       VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
2941       // The first "clone" is the original copy, which doesn't have a VMap.
2942       assert(VMaps.size() == NumClones - 1);
2943       Changed = true;
2944       ClonesCreated = true;
2945       NumClonesCreated = NumClones;
2946     };
2947 
2948     // Locate the summary for F.
2949     ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
2950     // If not found, this could be an imported local (see comment in
2951     // findValueInfoForFunc). Skip for now as it will be cloned in its original
2952     // module (where it would have been promoted to global scope so should
2953     // satisfy any reference in this module).
2954     if (!TheFnVI)
2955       continue;
2956 
2957     auto *GVSummary =
2958         ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
2959     if (!GVSummary)
2960       // Must have been imported; use the first summary (there might be
2961       // multiple if this was a linkonce_odr).
2962       GVSummary = TheFnVI.getSummaryList().front().get();
2963 
2964     // If this was an imported alias, skip it, as we won't have the function
2965     // summary, and it should be cloned in the original module.
2966     if (isa<AliasSummary>(GVSummary))
2967       continue;
2968 
2969     auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
2970 
2971     if (FS->allocs().empty() && FS->callsites().empty())
2972       continue;
2973 
2974     auto SI = FS->callsites().begin();
2975     auto AI = FS->allocs().begin();
2976 
2977     // Assume for now that the instructions are in the exact same order
2978     // as when the summary was created, but confirm this is correct by
2979     // matching the stack ids.
2980     for (auto &BB : F) {
2981       for (auto &I : BB) {
2982         auto *CB = dyn_cast<CallBase>(&I);
2983         // Same handling as when creating module summary.
2984         if (!mayHaveMemprofSummary(CB))
2985           continue;
2986 
2987         auto *CalledValue = CB->getCalledOperand();
2988         auto *CalledFunction = CB->getCalledFunction();
2989         if (CalledValue && !CalledFunction) {
2990           CalledValue = CalledValue->stripPointerCasts();
2991           // Stripping pointer casts can reveal a called function.
2992           CalledFunction = dyn_cast<Function>(CalledValue);
2993         }
2994         // Check if this is an alias to a function. If so, get the
2995         // called aliasee for the checks below.
2996         if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2997           assert(!CalledFunction &&
2998                  "Expected null called function in callsite for alias");
2999           CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3000         }
3001 
3002         CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3003             I.getMetadata(LLVMContext::MD_callsite));
3004         auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3005 
3006         // Include allocs that were already assigned a memprof function
3007         // attribute in the statistics.
3008         if (CB->getAttributes().hasFnAttr("memprof")) {
3009           assert(!MemProfMD);
3010           CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3011               ? AllocTypeColdThinBackend++
3012               : AllocTypeNotColdThinBackend++;
3013           OrigAllocsThinBackend++;
3014           AllocVersionsThinBackend++;
3015           if (!MaxAllocVersionsThinBackend)
3016             MaxAllocVersionsThinBackend = 1;
3017           // Remove any remaining callsite metadata; we can skip the rest of
3018           // the handling for this instruction, since no cloning is needed.
3019           I.setMetadata(LLVMContext::MD_callsite, nullptr);
3020           continue;
3021         }
3022 
3023         if (MemProfMD) {
3024           // Consult the next alloc node.
3025           assert(AI != FS->allocs().end());
3026           auto &AllocNode = *(AI++);
3027 
3028           // Sanity check that the MIB stack ids match between the summary and
3029           // instruction metadata.
3030           auto MIBIter = AllocNode.MIBs.begin();
3031           for (auto &MDOp : MemProfMD->operands()) {
3032             assert(MIBIter != AllocNode.MIBs.end());
3033             LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3034                 MIBIter->StackIdIndices.begin();
3035             auto *MIBMD = cast<const MDNode>(MDOp);
3036             MDNode *StackMDNode = getMIBStackNode(MIBMD);
3037             assert(StackMDNode);
3038             SmallVector<unsigned> StackIdsFromMetadata;
3039             CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
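                 // Walk only the MIB stack frames beyond the prefix shared with
                 // the callsite context (beginAfterSharedPrefix skips the shared
                 // frames), checking each against the summary's stack id indices.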
3040             for (auto ContextIter =
3041                      StackContext.beginAfterSharedPrefix(CallsiteContext);
3042                  ContextIter != StackContext.end(); ++ContextIter) {
3043               // If this is a direct recursion, simply skip the duplicate
3044               // entries, to be consistent with how the summary ids were
3045               // generated during ModuleSummaryAnalysis.
3046               if (!StackIdsFromMetadata.empty() &&
3047                   StackIdsFromMetadata.back() == *ContextIter)
3048                 continue;
3049               assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3050               assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3051                      *ContextIter);
3052               StackIdIndexIter++;
3053             }
3054             MIBIter++;
3055           }
3056 
3057           // Perform cloning if not yet done.
3058           CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3059 
3060           OrigAllocsThinBackend++;
3061           AllocVersionsThinBackend += AllocNode.Versions.size();
3062           if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3063             MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3064 
3065           // If there is only one version, that means we didn't end up
3066           // considering this function for cloning, and in that case the alloc
3067           // will still be None type or should have gotten the default NotCold.
3068           // Skip it after calling the clone helper, since the helper does some
3069           // sanity checks confirming that we haven't yet decided cloning is needed.
3070           if (AllocNode.Versions.size() == 1) {
3071             assert((AllocationType)AllocNode.Versions[0] ==
3072                        AllocationType::NotCold ||
3073                    (AllocationType)AllocNode.Versions[0] ==
3074                        AllocationType::None);
3075             UnclonableAllocsThinBackend++;
3076             continue;
3077           }
3078 
3079           // All versions should have a singular allocation type.
3080           assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3081             return Type == ((uint8_t)AllocationType::NotCold |
3082                             (uint8_t)AllocationType::Cold);
3083           }));
3084 
3085           // Update the allocation types per the summary info.
3086           for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3087             // Ignore any that didn't get an assigned allocation type.
3088             if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3089               continue;
3090             AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3091             AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3092                                             : AllocTypeNotColdThinBackend++;
3093             std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3094             auto A = llvm::Attribute::get(F.getContext(), "memprof",
3095                                           AllocTypeString);
3096             CallBase *CBClone;
3097             // Copy 0 is the original function.
3098             if (!J)
3099               CBClone = CB;
3100             else
3101               // Since VMaps are only created for new clones, we index with
3102               // clone J-1 (J==0 is the original clone and does not have a VMaps
3103               // entry).
3104               CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3105             CBClone->addFnAttr(A);
3106             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3107                      << ore::NV("AllocationCall", CBClone) << " in clone "
3108                      << ore::NV("Caller", CBClone->getFunction())
3109                      << " marked with memprof allocation attribute "
3110                      << ore::NV("Attribute", AllocTypeString));
3111           }
3112         } else if (!CallsiteContext.empty()) {
3113           // Consult the next callsite node.
3114           assert(SI != FS->callsites().end());
3115           auto &StackNode = *(SI++);
3116 
3117 #ifndef NDEBUG
3118           // Sanity check that the stack ids match between the summary and
3119           // instruction metadata.
3120           auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3121           for (auto StackId : CallsiteContext) {
3122             assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3123             assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3124                    StackId);
3125             StackIdIndexIter++;
3126           }
3127 #endif
3128 
3129           // Perform cloning if not yet done.
3130           CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3131 
3132           // Should have skipped indirect calls via mayHaveMemprofSummary.
3133           assert(CalledFunction);
3134           assert(!IsMemProfClone(*CalledFunction));
3135 
3136           // Update the calls per the summary info.
3137           // Save the original name, since it gets updated in the first
3138           // iteration below.
3139           auto CalleeOrigName = CalledFunction->getName();
3140           for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3141             // Do nothing if this version calls the original version of its
3142             // callee.
3143             if (!StackNode.Clones[J])
3144               continue;
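                 // Get or insert a declaration for the callee clone with the
                 // matching memprof clone name; the clone's definition is created
                 // when that callee function is itself processed (here or in its
                 // defining module).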
3145             auto NewF = M.getOrInsertFunction(
3146                 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3147                 CalledFunction->getFunctionType());
3148             CallBase *CBClone;
3149             // Copy 0 is the original function.
3150             if (!J)
3151               CBClone = CB;
3152             else
3153               CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3154             CBClone->setCalledFunction(NewF);
3155             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3156                      << ore::NV("Call", CBClone) << " in clone "
3157                      << ore::NV("Caller", CBClone->getFunction())
3158                      << " assigned to call function clone "
3159                      << ore::NV("Callee", NewF.getCallee()));
3160           }
3161         }
3162         // Memprof and callsite metadata on memory allocations are no longer needed.
3163         I.setMetadata(LLVMContext::MD_memprof, nullptr);
3164         I.setMetadata(LLVMContext::MD_callsite, nullptr);
3165       }
3166     }
3167   }
3168 
3169   return Changed;
3170 }
3171 
3172 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3173 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3174   if (DumpCCG) {
3175     dbgs() << "CCG before cloning:\n";
3176     dbgs() << *this;
3177   }
3178   if (ExportToDot)
3179     exportToDot("postbuild");
3180 
3181   if (VerifyCCG) {
3182     check();
3183   }
3184 
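       // Perform the graph-level cloning of context nodes needed to disambiguate
       // the cold allocation contexts.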
3185   identifyClones();
3186 
3187   if (VerifyCCG) {
3188     check();
3189   }
3190 
3191   if (DumpCCG) {
3192     dbgs() << "CCG after cloning:\n";
3193     dbgs() << *this;
3194   }
3195   if (ExportToDot)
3196     exportToDot("cloned");
3197 
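       // Assign the callsite node clones to (possibly new) function clones, and
       // record the resulting call updates.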
3198   bool Changed = assignFunctions();
3199 
3200   if (DumpCCG) {
3201     dbgs() << "CCG after assigning function clones:\n";
3202     dbgs() << *this;
3203   }
3204   if (ExportToDot)
3205     exportToDot("clonefuncassign");
3206 
3207   return Changed;
3208 }
3209 
3210 bool MemProfContextDisambiguation::processModule(
3211     Module &M,
3212     function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3213 
3214   // If we have an import summary, then the cloning decisions were made during
3215   // the thin link on the index. Apply them and return.
3216   if (ImportSummary)
3217     return applyImport(M);
3218 
3219   // TODO: If/when other types of memprof cloning are enabled beyond just for
3220   // hot and cold, we will need to change this to individually control the
3221   // AllocationType passed to addStackNodesForMIB during CCG construction.
3222   // Note that we specifically check this after applying imports above, so that
3223   // the option does not need to be passed to distributed ThinLTO backend
3224   // clang processes, which won't necessarily have visibility into the linker
3225   // dependences. Instead the information is communicated from the LTO link to
3226   // the backends via the combined summary index.
3227   if (!SupportsHotColdNew)
3228     return false;
3229 
3230   ModuleCallsiteContextGraph CCG(M, OREGetter);
3231   return CCG.process();
3232 }
3233 
3234 MemProfContextDisambiguation::MemProfContextDisambiguation(
3235     const ModuleSummaryIndex *Summary)
3236     : ImportSummary(Summary) {
3237   if (ImportSummary) {
3238     // The MemProfImportSummary should only be used for testing ThinLTO
3239     // distributed backend handling via opt, in which case we don't have a
3240     // summary from the pass pipeline.
3241     assert(MemProfImportSummary.empty());
3242     return;
3243   }
3244   if (MemProfImportSummary.empty())
3245     return;
3246 
3247   auto ReadSummaryFile =
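       // Testing-only path: load a combined summary index from the file named by
       // the MemProfImportSummary option and use it as the import summary.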
3248       errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3249   if (!ReadSummaryFile) {
3250     logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3251                           "Error loading file '" + MemProfImportSummary +
3252                               "': ");
3253     return;
3254   }
3255   auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3256   if (!ImportSummaryForTestingOrErr) {
3257     logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3258                           "Error parsing file '" + MemProfImportSummary +
3259                               "': ");
3260     return;
3261   }
3262   ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3263   ImportSummary = ImportSummaryForTesting.get();
3264 }
3265 
3266 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3267                                                     ModuleAnalysisManager &AM) {
3268   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3269   auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3270     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3271   };
3272   if (!processModule(M, OREGetter))
3273     return PreservedAnalyses::all();
3274   return PreservedAnalyses::none();
3275 }
3276 
3277 void MemProfContextDisambiguation::run(
3278     ModuleSummaryIndex &Index,
3279     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3280         isPrevailing) {
3281   // TODO: If/when other types of memprof cloning are enabled beyond just for
3282   // hot and cold, we will need to change this to individually control the
3283   // AllocationType passed to addStackNodesForMIB during CCG construction.
3284   // The index was set from the option, so these should be in sync.
3285   assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3286   if (!SupportsHotColdNew)
3287     return;
3288 
3289   IndexCallsiteContextGraph CCG(Index, isPrevailing);
3290   CCG.process();
3291 }
3292