xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp (revision d95b82c49aef0223bcc03ff5a9163e70037c82be)
//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen. This is completely
/// different from the common SplitModule pass, as this system is designed with
/// AMDGPU in mind.
///
/// The basic idea of this module splitting implementation is the same as
/// SplitModule: load-balance the module's functions across a set of N
/// partitions to allow parallel codegen. However, it does it very
/// differently than the target-agnostic variant:
///   - Kernels are used as the module's "roots".
///     They're known entry points on AMDGPU, and everything else is often
///     internal only.
///   - Each kernel has a set of dependencies, and when a kernel and its
///     dependencies are considered "big", we try to put it in a partition
///     where most dependencies are already imported, to avoid duplicating
///     large amounts of code.
///   - There's special care for indirect calls in order to ensure
///     AMDGPUResourceUsageAnalysis can work correctly.
///
/// This file also includes a more elaborate logging system to enable
/// users to easily generate logs that (if desired) do not include any value
/// names, so as not to leak information about the source file.
/// Such logs are very helpful for understanding and fixing potential issues
/// with module splitting.
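///
/// As an illustrative sketch (hypothetical kernels and helpers, not taken
/// from any real module): given kernels @K0 and @K1, where @K0 calls helper
/// @A and @K1 calls helpers @A and @B, splitting into two partitions could
/// yield P0 = {K0, A} and P1 = {K1, A, B}. @A is duplicated because both
/// roots depend on it, while @B only lands in @K1's partition.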

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/SHA256.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-split-module"

namespace {

static cl::opt<float> LargeKernelFactor(
    "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
    cl::Hidden,
    cl::desc(
        "consider a kernel as large and needing special treatment when it "
        "exceeds the average cost of a partition by this factor; e.g. 2.0 "
        "means the kernel and its dependencies are 2 times bigger than "
        "an average partition; 0 disables large kernel handling entirely"));

static cl::opt<float> LargeKernelOverlapForMerge(
    "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
    cl::Hidden,
    cl::desc("defines how much overlap between two large kernels' dependencies "
             "is needed to put them in the same partition"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local "
             "linkage; may cause globals to be duplicated, which increases "
             "binary size"));

static cl::opt<std::string>
    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
              cl::desc("output directory for AMDGPU module splitting logs"));

static cl::opt<bool>
    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
               cl::desc("hash value names before printing them in the AMDGPU "
                        "module splitting logs"));

using CostType = InstructionCost::CostType;
using PartitionID = unsigned;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;

static bool isEntryPoint(const Function *F) {
  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
}

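/// \returns the name of \p V for logging purposes. If private logging is
/// enabled (through the -amdgpu-module-splitting-log-private option or the
/// AMD_SPLIT_MODULE_LOG_PRIVATE environment variable), the name is replaced
/// by its SHA256 hash so logs don't leak identifiers from the source file.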
static std::string getName(const Value &V) {
  static bool HideNames;

  static llvm::once_flag HideNameInitFlag;
  llvm::call_once(HideNameInitFlag, [&]() {
    if (LogPrivate.getNumOccurrences())
      HideNames = LogPrivate;
    else {
      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
      HideNames = (EV.value_or("0") != "0");
    }
  });

  if (!HideNames)
    return V.getName().str();
  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
               /*LowerCase=*/true);
}

/// Main logging helper.
///
/// Logging can be configured by the following environment variables.
///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
///     If set, uses <filepath> as the directory to write logfiles to
///     each time module splitting is used.
///   AMD_SPLIT_MODULE_LOG_PRIVATE
///     If set to anything other than zero, all names are hidden.
///
/// Both environment variables have corresponding CL options which
/// take priority over them.
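///
/// For example (hypothetical paths), running with
/// AMD_SPLIT_MODULE_LOG_DIR=/tmp/split-logs would write one uniquely named
/// logfile under /tmp/split-logs each time splitting runs, and additionally
/// setting AMD_SPLIT_MODULE_LOG_PRIVATE=1 would hash every value name in
/// that log.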
///
/// Any output printed to the log files is also printed to dbgs() when -debug is
/// used and LLVM_DEBUG is defined.
///
/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
/// cannot be removed from the code (by building without debug). This probably
/// has a small performance cost because if some computation/formatting is
/// needed for logging purposes, it may be done every time only to be ignored
/// by the logger.
///
/// As this pass only runs once and is not doing anything computationally
/// expensive, this is likely a reasonable trade-off.
///
/// If some computation should really be avoided when unused, users of the class
/// can check whether any logging will occur by using the bool operator.
///
/// \code
///   if (SML) {
///     // Executes only if logging to a file or if -debug is available and
///     // used.
///   }
/// \endcode
class SplitModuleLogger {
public:
  SplitModuleLogger(const Module &M) {
    std::string LogDir = LogDirOpt;
    if (LogDir.empty())
      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");

    // No log dir specified means we don't need to log to a file.
    // We may still log to dbgs(), though.
    if (LogDir.empty())
      return;

    // If a log directory is specified, create a new file with a unique name in
    // that directory.
    int Fd;
    SmallString<0> PathTemplate;
    SmallString<0> RealPath;
    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
    if (auto Err =
            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
                             "': " + Err.message(),
                         /*CrashDiag=*/false);
    }

    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
  }

  bool hasLogFile() const { return FileOS != nullptr; }

  raw_ostream &logfile() {
    assert(FileOS && "no logfile!");
    return *FileOS;
  }

  /// \returns true if this SML will log anything either to a file or dbgs().
  /// Can be used to avoid expensive computations that are ignored when logging
  /// is disabled.
  operator bool() const {
    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
  }

private:
  std::unique_ptr<raw_fd_ostream> FileOS;
};

template <typename Ty>
static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
  static_assert(
      !std::is_same_v<Ty, Value>,
      "do not print values to logs directly, use getName instead!");
  LLVM_DEBUG(dbgs() << Val);
  if (SML.hasLogFile())
    SML.logfile() << Val;
  return SML;
}

/// Calculates the cost of each function in \p M.
/// \param SML Log Helper
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
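///
/// For example (hypothetical numbers): a module containing one kernel of
/// cost 100 and one non-kernel helper of cost 50 would report a total
/// ModuleCost of 150, with KernelCost = 100 (66.67%) and FnsCost = 50
/// (33.33%).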
static CostType
calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
                       DenseMap<const Function *, CostType> &CostMap) {
  CostType ModuleCost = 0;
  CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal =
            Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (isEntryPoint(&Fn))
      KernelCost += FnCost;
  }

  CostType FnCost = (ModuleCost - KernelCost);
  SML << "=> Total Module Cost: " << ModuleCost << '\n'
      << "  => KernelCost: " << KernelCost << " ("
      << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n"
      << "  => FnsCost: " << FnCost << " ("
      << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "%)\n";

  return ModuleCost;
}

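/// \returns true if \p F is a plausible target of an indirect call: a defined
/// non-kernel function that is either visible outside this module or has its
/// address observably taken.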
static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || isEntryPoint(&F))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}

/// When a kernel or any of its callees performs an indirect call, this function
/// takes over \ref addAllDependencies and adds all potentially callable
/// functions to \p Fns so they can be counted as dependencies of the kernel.
///
/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
/// presence of an indirect call, the function's resource usage is the same as
/// the most expensive function in the module.
/// \param M    The module.
/// \param Fns[out] Resulting list of functions.
static void addAllIndirectCallDependencies(const Module &M,
                                           DenseSet<const Function *> &Fns) {
  for (const auto &Fn : M) {
    if (canBeIndirectlyCalled(Fn))
      Fns.insert(&Fn);
  }
}

/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
/// callee until all reachable functions have been gathered.
///
/// \param SML Log Helper
/// \param CG Call graph for \p Fn's module.
/// \param Fn Current function to look at.
/// \param Fns[out] Resulting list of functions.
/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
/// point, either in \p Fn or in one of the functions it calls. When that
/// happens, we fall back to adding all callable functions inside \p Fn's module
/// to \p Fns.
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
                               const Function &Fn,
                               DenseSet<const Function *> &Fns,
                               bool &HadIndirectCall) {
  assert(!Fn.isDeclaration());

  const Module &M = *Fn.getParent();
  SmallVector<const Function *> WorkList({&Fn});
  while (!WorkList.empty()) {
    const auto &CurFn = *WorkList.pop_back_val();
    assert(!CurFn.isDeclaration());

    // Scan for an indirect call. If such a call is found, we have to
    // conservatively assume this can call all non-entrypoint functions in the
    // module.

    for (auto &CGEntry : *CG[&CurFn]) {
      auto *CGNode = CGEntry.second;
      auto *Callee = CGNode->getFunction();
      if (!Callee) {
        // Functions have an edge towards CallsExternalNode if they're external
        // declarations, or if they do an indirect call. As we only process
        // definitions here, we know this means the function has an indirect
        // call. We then have to conservatively assume this can call all
        // non-entrypoint functions in the module.
        if (CGNode != CG.getCallsExternalNode())
          continue; // this is another function-less node we don't care about.

        SML << "Indirect call detected in " << getName(CurFn)
            << " - treating all non-entrypoint functions as "
               "potential dependencies\n";

        // TODO: Print an ORE as well?
        addAllIndirectCallDependencies(M, Fns);
        HadIndirectCall = true;
        continue;
      }

      if (Callee->isDeclaration())
        continue;

      auto [It, Inserted] = Fns.insert(Callee);
      if (Inserted)
        WorkList.push_back(Callee);
    }
  }
}

/// Contains information about a kernel and its dependencies.
struct KernelWithDependencies {
  KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
                         const DenseMap<const Function *, CostType> &FnCosts,
                         const Function *Fn)
      : Fn(Fn) {
    addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
    TotalCost = FnCosts.at(Fn);
    for (const auto *Dep : Dependencies) {
      TotalCost += FnCosts.at(Dep);

      // We cannot duplicate functions with external linkage, or functions that
      // may be overridden at runtime.
      HasNonDuplicatableDependency |=
          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
    }
  }

  const Function *Fn = nullptr;
  DenseSet<const Function *> Dependencies;
  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
  bool HasIndirectCall = false;
  /// Whether any of \p Fn's dependencies cannot be duplicated.
  bool HasNonDuplicatableDependency = false;

  CostType TotalCost = 0;

  /// \returns true if this kernel and its dependencies can be considered large
  /// according to \p Threshold.
  bool isLarge(CostType Threshold) const {
    return TotalCost > Threshold && !Dependencies.empty();
  }
};

/// Calculates how much overlap there is between \p A and \p B.
/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
/// and B have no shared elements. Kernels do not count in overlap calculation.
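///
/// For example (hypothetical sets): with A = {f, g, h} and B = {g, h, i},
/// none of them kernels, the union holds 4 functions and 2 of them are
/// shared, so the overlap is 2 / 4 = 0.5.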
static float calculateOverlap(const DenseSet<const Function *> &A,
                              const DenseSet<const Function *> &B) {
  DenseSet<const Function *> Total;
  for (const auto *F : A) {
    if (!isEntryPoint(F))
      Total.insert(F);
  }

  if (Total.empty())
    return 0.0f;

  unsigned NumCommon = 0;
  for (const auto *F : B) {
    if (isEntryPoint(F))
      continue;

    auto [It, Inserted] = Total.insert(F);
    if (!Inserted)
      ++NumCommon;
  }

  return static_cast<float>(NumCommon) / Total.size();
}

/// Performs all of the partitioning work on \p M.
/// \param SML Log Helper
/// \param M Module to partition.
/// \param NumParts Number of partitions to create.
/// \param ModuleCost Total cost of all functions in \p M.
/// \param FnCosts Map of Function -> Cost
/// \param WorkList Kernels and their dependencies to process in order.
/// \returns The created partitions (a vector of size \p NumParts)
static std::vector<DenseSet<const Function *>>
doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
               CostType ModuleCost,
               const DenseMap<const Function *, CostType> &FnCosts,
               const SmallVector<KernelWithDependencies> &WorkList) {

  SML << "\n--Partitioning Starts--\n";

  // Calculate a "large kernel threshold". When a kernel's total import cost
  // exceeds this value, we will try to merge it with other, similarly large
  // kernels.
  //
  // e.g. let two kernels X and Y each have an import cost of ~10% of the
  // module; we assign X to a partition as usual, but when we get to Y, we
  // check if it's worth also putting it in X's partition.
  const CostType LargeKernelThreshold =
      LargeKernelFactor
          ? CostType(((ModuleCost / NumParts) * LargeKernelFactor))
          : std::numeric_limits<CostType>::max();
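
  // As a worked example (hypothetical numbers): with ModuleCost = 1000,
  // NumParts = 4 and the default LargeKernelFactor of 2.0, the threshold is
  // (1000 / 4) * 2.0 = 500, i.e. a kernel counts as "large" once it and its
  // dependencies cost more than half of the module.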

  std::vector<DenseSet<const Function *>> Partitions;
  Partitions.resize(NumParts);

  // Assign a partition to each kernel, and try to keep the partitions more or
  // less balanced. We do that through a priority queue sorted in reverse, so we
  // can always look at the partition with the least content.
  //
  // There are some cases where we will be deliberately unbalanced though.
  //  - Large kernels: we try to merge with existing partitions to reduce code
  //  duplication.
  //  - Kernels with indirect or external calls always go in the first partition
  //  (P0).
  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
                              const std::pair<PartitionID, CostType> &b) {
    // When two partitions have the same cost, assign to the one with the
    // biggest ID first. This allows us to put things in P0 last, because P0 may
    // have other stuff added later.
    if (a.second == b.second)
      return a.first < b.first;
    return a.second > b.second;
  };
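
  // For instance (hypothetical costs): {P0: 10, P1: 10, P2: 5} sorts to
  // [P0(10), P1(10), P2(5)], so back() yields P2, the cheapest partition;
  // among equal costs, the biggest ID sits closest to the back.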

  // We can't use priority_queue here because we need to be able to access any
  // element. This makes this a bit inefficient as we need to sort it again
  // every time we change it, but it's a very small array anyway (likely under
  // 64 partitions) so it's a cheap operation.
  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
  for (unsigned I = 0; I < NumParts; ++I)
    BalancingQueue.push_back(std::make_pair(I, 0));

  // Helper function to handle assigning a kernel to a partition. This takes
  // care of updating the balancing queue.
  const auto AssignToPartition = [&](PartitionID PID,
                                     const KernelWithDependencies &KWD) {
    auto &FnsInPart = Partitions[PID];
    FnsInPart.insert(KWD.Fn);
    FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());

    SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n  ->  ";
    if (!KWD.Dependencies.empty()) {
      SML << KWD.Dependencies.size() << " dependencies added\n";
    }

    // Update the balancing queue. We scan backwards because in the common case
    // the partition is at the end.
    for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
      if (QueuePID == PID) {
        CostType NewCost = 0;
        for (auto *Fn : Partitions[PID])
          NewCost += FnCosts.at(Fn);

        SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
        if (Cost) {
          SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
              << "% increase)";
        }
        SML << '\n';

        Cost = NewCost;
      }
    }

    sort(BalancingQueue, ComparePartitions);
  };

  for (auto &CurKernel : WorkList) {
    // When a kernel has indirect calls, it must stay in the first partition
    // alongside every reachable non-entry function. This is a nightmare case
    // for splitting as it severely limits what we can do.
    if (CurKernel.HasIndirectCall) {
      SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
          << " defaulting to P0\n";
      AssignToPartition(0, CurKernel);
      continue;
    }

    // When a kernel has non-duplicatable dependencies, we have to keep it in
    // the first partition as well. This is a conservative approach; a
    // finer-grained approach could keep track of exactly which dependencies
    // are non-duplicatable and just make sure they're grouped together.
    if (CurKernel.HasNonDuplicatableDependency) {
      SML << "Kernel with externally visible dependency "
          << getName(*CurKernel.Fn) << " defaulting to P0\n";
      AssignToPartition(0, CurKernel);
      continue;
    }

    // Be smart with large kernels to avoid duplicating their dependencies.
    if (CurKernel.isLarge(LargeKernelThreshold)) {
      assert(LargeKernelOverlapForMerge >= 0.0f &&
             LargeKernelOverlapForMerge <= 1.0f);
      SML << "Large Kernel: " << getName(*CurKernel.Fn)
          << " - looking for partition with at least "
          << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";

      bool Assigned = false;
      for (const auto &[PID, Fns] : enumerate(Partitions)) {
        float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
        SML << "  => " << format("%0.2f", Overlap * 100) << "% overlap with P"
            << PID << '\n';
        if (Overlap > LargeKernelOverlapForMerge) {
          SML << "  selecting P" << PID << '\n';
          AssignToPartition(PID, CurKernel);
          Assigned = true;
        }
      }

      if (Assigned)
        continue;
    }

    // Normal "load-balancing": assign to the partition with the least
    // pressure.
    auto [PID, CurCost] = BalancingQueue.back();
    AssignToPartition(PID, CurKernel);
  }

  // Work is mostly done now; verify the partitioning and add all functions we
  // may have missed (= unreachable, or we don't understand how they're
  // reached) to P0.
  DenseSet<const Function *> AllFunctions;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    CostType Cost = 0;
    for (auto *Fn : Part) {
      // External linkage functions should exclusively be in the first
      // partition at this stage. In theory, we should only ever see external
      // linkage functions here if they're kernels, or if they've been added
      // due to a kernel using indirect calls somewhere in its CallGraph.
      assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
      Cost += FnCosts.at(Fn);
    }
    SML << "P" << Idx << " has a total cost of " << Cost << " ("
        << format("%0.2f", (float(Cost) / ModuleCost) * 100)
        << "% of source module)\n";
    AllFunctions.insert(Part.begin(), Part.end());
  }

  // Add missed functions to P0. This will take care of adding things like
  // external functions with no callers in the module to P0. This should be
  // fairly rare as AMDGPU internalizes everything in most cases, so unused
  // internal functions would get removed.
  for (auto &Fn : M) {
    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
      SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
      Partitions[0].insert(&Fn);
    }
  }

  SML << "--Partitioning Done--\n\n";

  return Partitions;
}

static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}

static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned N,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {

  SplitModuleLogger SML(M);

  CallGraph CG(M);

  // Externalize functions whose address is taken.
  //
  // This is needed because partitioning is purely based on calls, but sometimes
  // a kernel/function may just look at the address of another local function
  // and not do anything (no calls). After partitioning, that local function may
  // end up in a different module (so it's just a declaration in the module
  // where its address is taken), which emits an "undefined hidden symbol"
  // linker error.
  //
  // Additionally, it guides partitioning to not duplicate this function if it's
  // called directly at some point.
  for (auto &Fn : M) {
    if (Fn.hasAddressTaken()) {
      if (Fn.hasLocalLinkage()) {
        SML << "[externalize] " << Fn.getName()
            << " because its address is taken\n";
      }
      externalize(Fn);
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        SML << "[externalize] GV " << GV.getName() << '\n';
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  DenseMap<const Function *, CostType> FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);

  // Gather every kernel into a WorkList, then sort it by total cost in
  // descending order so the biggest kernels are seen first.
  SmallVector<KernelWithDependencies> WorkList;
  for (auto &Fn : M) {
    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
  }
  sort(WorkList, [&](auto &A, auto &B) {
    // Sort by total cost, and if the total cost is identical, sort
    // alphabetically.
    if (A.TotalCost == B.TotalCost)
      return A.Fn->getName() < B.Fn->getName();
    return A.TotalCost > B.TotalCost;
  });

  if (SML) {
    SML << "Worklist\n";
    for (const auto &KWD : WorkList) {
      SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
          << " indirect:" << KWD.HasIndirectCall
          << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependency
          << ")\n";
      for (const auto *Dep : KWD.Dependencies)
        SML << "  [Dep] " << getName(*Dep) << '\n';
    }
  }

  // This performs all of the partitioning work.
  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
  assert(Partitions.size() == N);

  // If we didn't externalize GVs, then local GVs need to be conservatively
  // imported into every module (including their initializers), and then cleaned
  // up afterwards.
  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
    // We conservatively import private/internal GVs into every module and clean
    // them up afterwards.
    const auto *Var = dyn_cast<GlobalVariable>(GV);
    return Var && Var->hasLocalLinkage();
  };

  SML << "Creating " << N << " modules...\n";
  unsigned TotalFnImpls = 0;
  for (unsigned I = 0; I < N; ++I) {
    const auto &FnsInPart = Partitions[I];

    ValueToValueMapTy VMap;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV)) {
// Check that we don't import an external linkage function in any
// partition other than P0.
#ifndef NDEBUG
            if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
              assert((I == 0) == FnsInPart.contains(Fn));
            }
#endif
            return FnsInPart.contains(Fn);
          }

          if (NeedsConservativeImport(GV))
            return true;

          // Everything else goes in the first partition.
          return I == 0;
        }));

    // Clean up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->globals())) {
      if (NeedsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    unsigned NumAllFns = 0, NumKernels = 0;
    for (auto &Cur : *MPart) {
      if (!Cur.isDeclaration()) {
        ++NumAllFns;
        if (isEntryPoint(&Cur))
          ++NumKernels;
      }
    }
    TotalFnImpls += NumAllFns;
    SML << "  - Module " << I << " with " << NumAllFns << " functions ("
        << NumKernels << " kernels)\n";
    ModuleCallback(std::move(MPart));
  }

  SML << TotalFnImpls << " function definitions across all modules ("
      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
      << "% of original module)\n";
}
} // namespace

PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };
  splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
  // We don't change the original module.
  return PreservedAnalyses::all();
}
757