xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1*0fca6ea1SDimitry Andric //===- AMDGPUSplitModule.cpp ----------------------------------------------===//
2*0fca6ea1SDimitry Andric //
3*0fca6ea1SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*0fca6ea1SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*0fca6ea1SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*0fca6ea1SDimitry Andric //
7*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===//
8*0fca6ea1SDimitry Andric //
9*0fca6ea1SDimitry Andric /// \file Implements a module splitting algorithm designed to support the
10*0fca6ea1SDimitry Andric /// FullLTO --lto-partitions option for parallel codegen. This is completely
11*0fca6ea1SDimitry Andric /// different from the common SplitModule pass, as this system is designed with
12*0fca6ea1SDimitry Andric /// AMDGPU in mind.
13*0fca6ea1SDimitry Andric ///
14*0fca6ea1SDimitry Andric /// The basic idea of this module splitting implementation is the same as
15*0fca6ea1SDimitry Andric /// SplitModule: load-balance the module's functions across a set of N
16*0fca6ea1SDimitry Andric /// partitions to allow parallel codegen. However, it does it very
17*0fca6ea1SDimitry Andric /// differently than the target-agnostic variant:
18*0fca6ea1SDimitry Andric ///   - The module has "split roots", which are kernels in the vast
19*0fca6ea1SDimitry Andric //      majority of cases.
20*0fca6ea1SDimitry Andric ///   - Each root has a set of dependencies, and when a root and its
21*0fca6ea1SDimitry Andric ///     dependencies is considered "big", we try to put it in a partition where
22*0fca6ea1SDimitry Andric ///     most dependencies are already imported, to avoid duplicating large
23*0fca6ea1SDimitry Andric ///     amounts of code.
24*0fca6ea1SDimitry Andric ///   - There's special care for indirect calls in order to ensure
25*0fca6ea1SDimitry Andric ///     AMDGPUResourceUsageAnalysis can work correctly.
26*0fca6ea1SDimitry Andric ///
27*0fca6ea1SDimitry Andric /// This file also includes a more elaborate logging system to enable
28*0fca6ea1SDimitry Andric /// users to easily generate logs that (if desired) do not include any value
29*0fca6ea1SDimitry Andric /// names, in order to not leak information about the source file.
30*0fca6ea1SDimitry Andric /// Such logs are very helpful to understand and fix potential issues with
31*0fca6ea1SDimitry Andric /// module splitting.
32*0fca6ea1SDimitry Andric 
33*0fca6ea1SDimitry Andric #include "AMDGPUSplitModule.h"
34*0fca6ea1SDimitry Andric #include "AMDGPUTargetMachine.h"
35*0fca6ea1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
36*0fca6ea1SDimitry Andric #include "llvm/ADT/DenseMap.h"
37*0fca6ea1SDimitry Andric #include "llvm/ADT/SmallVector.h"
38*0fca6ea1SDimitry Andric #include "llvm/ADT/StringExtras.h"
39*0fca6ea1SDimitry Andric #include "llvm/ADT/StringRef.h"
40*0fca6ea1SDimitry Andric #include "llvm/Analysis/CallGraph.h"
41*0fca6ea1SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h"
42*0fca6ea1SDimitry Andric #include "llvm/IR/Function.h"
43*0fca6ea1SDimitry Andric #include "llvm/IR/Instruction.h"
44*0fca6ea1SDimitry Andric #include "llvm/IR/Module.h"
45*0fca6ea1SDimitry Andric #include "llvm/IR/User.h"
46*0fca6ea1SDimitry Andric #include "llvm/IR/Value.h"
47*0fca6ea1SDimitry Andric #include "llvm/Support/Casting.h"
48*0fca6ea1SDimitry Andric #include "llvm/Support/Debug.h"
49*0fca6ea1SDimitry Andric #include "llvm/Support/FileSystem.h"
50*0fca6ea1SDimitry Andric #include "llvm/Support/Path.h"
51*0fca6ea1SDimitry Andric #include "llvm/Support/Process.h"
52*0fca6ea1SDimitry Andric #include "llvm/Support/SHA256.h"
53*0fca6ea1SDimitry Andric #include "llvm/Support/Threading.h"
54*0fca6ea1SDimitry Andric #include "llvm/Support/raw_ostream.h"
55*0fca6ea1SDimitry Andric #include "llvm/Transforms/Utils/Cloning.h"
56*0fca6ea1SDimitry Andric #include <algorithm>
57*0fca6ea1SDimitry Andric #include <cassert>
58*0fca6ea1SDimitry Andric #include <iterator>
59*0fca6ea1SDimitry Andric #include <memory>
60*0fca6ea1SDimitry Andric #include <utility>
61*0fca6ea1SDimitry Andric #include <vector>
62*0fca6ea1SDimitry Andric 
63*0fca6ea1SDimitry Andric using namespace llvm;
64*0fca6ea1SDimitry Andric 
65*0fca6ea1SDimitry Andric #define DEBUG_TYPE "amdgpu-split-module"
66*0fca6ea1SDimitry Andric 
67*0fca6ea1SDimitry Andric namespace {
68*0fca6ea1SDimitry Andric 
69*0fca6ea1SDimitry Andric static cl::opt<float> LargeFnFactor(
70*0fca6ea1SDimitry Andric     "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
71*0fca6ea1SDimitry Andric     cl::Hidden,
72*0fca6ea1SDimitry Andric     cl::desc(
73*0fca6ea1SDimitry Andric         "consider a function as large and needing special treatment when the "
74*0fca6ea1SDimitry Andric         "cost of importing it into a partition"
75*0fca6ea1SDimitry Andric         "exceeds the average cost of a partition by this factor; e;g. 2.0 "
76*0fca6ea1SDimitry Andric         "means if the function and its dependencies is 2 times bigger than "
77*0fca6ea1SDimitry Andric         "an average partition; 0 disables large functions handling entirely"));
78*0fca6ea1SDimitry Andric 
79*0fca6ea1SDimitry Andric static cl::opt<float> LargeFnOverlapForMerge(
80*0fca6ea1SDimitry Andric     "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
81*0fca6ea1SDimitry Andric     cl::Hidden,
82*0fca6ea1SDimitry Andric     cl::desc(
83*0fca6ea1SDimitry Andric         "defines how much overlap between two large function's dependencies "
84*0fca6ea1SDimitry Andric         "is needed to put them in the same partition"));
85*0fca6ea1SDimitry Andric 
86*0fca6ea1SDimitry Andric static cl::opt<bool> NoExternalizeGlobals(
87*0fca6ea1SDimitry Andric     "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
88*0fca6ea1SDimitry Andric     cl::desc("disables externalization of global variable with local linkage; "
89*0fca6ea1SDimitry Andric              "may cause globals to be duplicated which increases binary size"));
90*0fca6ea1SDimitry Andric 
91*0fca6ea1SDimitry Andric static cl::opt<std::string>
92*0fca6ea1SDimitry Andric     LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
93*0fca6ea1SDimitry Andric               cl::desc("output directory for AMDGPU module splitting logs"));
94*0fca6ea1SDimitry Andric 
95*0fca6ea1SDimitry Andric static cl::opt<bool>
96*0fca6ea1SDimitry Andric     LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
97*0fca6ea1SDimitry Andric                cl::desc("hash value names before printing them in the AMDGPU "
98*0fca6ea1SDimitry Andric                         "module splitting logs"));
99*0fca6ea1SDimitry Andric 
100*0fca6ea1SDimitry Andric using CostType = InstructionCost::CostType;
101*0fca6ea1SDimitry Andric using PartitionID = unsigned;
102*0fca6ea1SDimitry Andric using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
103*0fca6ea1SDimitry Andric 
104*0fca6ea1SDimitry Andric static bool isEntryPoint(const Function *F) {
105*0fca6ea1SDimitry Andric   return AMDGPU::isEntryFunctionCC(F->getCallingConv());
106*0fca6ea1SDimitry Andric }
107*0fca6ea1SDimitry Andric 
108*0fca6ea1SDimitry Andric static std::string getName(const Value &V) {
109*0fca6ea1SDimitry Andric   static bool HideNames;
110*0fca6ea1SDimitry Andric 
111*0fca6ea1SDimitry Andric   static llvm::once_flag HideNameInitFlag;
112*0fca6ea1SDimitry Andric   llvm::call_once(HideNameInitFlag, [&]() {
113*0fca6ea1SDimitry Andric     if (LogPrivate.getNumOccurrences())
114*0fca6ea1SDimitry Andric       HideNames = LogPrivate;
115*0fca6ea1SDimitry Andric     else {
116*0fca6ea1SDimitry Andric       const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
117*0fca6ea1SDimitry Andric       HideNames = (EV.value_or("0") != "0");
118*0fca6ea1SDimitry Andric     }
119*0fca6ea1SDimitry Andric   });
120*0fca6ea1SDimitry Andric 
121*0fca6ea1SDimitry Andric   if (!HideNames)
122*0fca6ea1SDimitry Andric     return V.getName().str();
123*0fca6ea1SDimitry Andric   return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
124*0fca6ea1SDimitry Andric                /*LowerCase=*/true);
125*0fca6ea1SDimitry Andric }
126*0fca6ea1SDimitry Andric 
127*0fca6ea1SDimitry Andric /// Main logging helper.
128*0fca6ea1SDimitry Andric ///
129*0fca6ea1SDimitry Andric /// Logging can be configured by the following environment variable.
130*0fca6ea1SDimitry Andric ///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
131*0fca6ea1SDimitry Andric ///     If set, uses <filepath> as the directory to write logfiles to
132*0fca6ea1SDimitry Andric ///     each time module splitting is used.
133*0fca6ea1SDimitry Andric ///   AMD_SPLIT_MODULE_LOG_PRIVATE
134*0fca6ea1SDimitry Andric ///     If set to anything other than zero, all names are hidden.
135*0fca6ea1SDimitry Andric ///
136*0fca6ea1SDimitry Andric /// Both environment variables have corresponding CL options which
137*0fca6ea1SDimitry Andric /// takes priority over them.
138*0fca6ea1SDimitry Andric ///
139*0fca6ea1SDimitry Andric /// Any output printed to the log files is also printed to dbgs() when -debug is
140*0fca6ea1SDimitry Andric /// used and LLVM_DEBUG is defined.
141*0fca6ea1SDimitry Andric ///
142*0fca6ea1SDimitry Andric /// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
143*0fca6ea1SDimitry Andric /// cannot be removed from the code (by building without debug). This probably
144*0fca6ea1SDimitry Andric /// has a small performance cost because if some computation/formatting is
145*0fca6ea1SDimitry Andric /// needed for logging purpose, it may be done everytime only to be ignored
146*0fca6ea1SDimitry Andric /// by the logger.
147*0fca6ea1SDimitry Andric ///
148*0fca6ea1SDimitry Andric /// As this pass only runs once and is not doing anything computationally
149*0fca6ea1SDimitry Andric /// expensive, this is likely a reasonable trade-off.
150*0fca6ea1SDimitry Andric ///
151*0fca6ea1SDimitry Andric /// If some computation should really be avoided when unused, users of the class
152*0fca6ea1SDimitry Andric /// can check whether any logging will occur by using the bool operator.
153*0fca6ea1SDimitry Andric ///
154*0fca6ea1SDimitry Andric /// \code
155*0fca6ea1SDimitry Andric ///   if (SML) {
156*0fca6ea1SDimitry Andric ///     // Executes only if logging to a file or if -debug is available and
157*0fca6ea1SDimitry Andric ///     used.
158*0fca6ea1SDimitry Andric ///   }
159*0fca6ea1SDimitry Andric /// \endcode
160*0fca6ea1SDimitry Andric class SplitModuleLogger {
161*0fca6ea1SDimitry Andric public:
162*0fca6ea1SDimitry Andric   SplitModuleLogger(const Module &M) {
163*0fca6ea1SDimitry Andric     std::string LogDir = LogDirOpt;
164*0fca6ea1SDimitry Andric     if (LogDir.empty())
165*0fca6ea1SDimitry Andric       LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
166*0fca6ea1SDimitry Andric 
167*0fca6ea1SDimitry Andric     // No log dir specified means we don't need to log to a file.
168*0fca6ea1SDimitry Andric     // We may still log to dbgs(), though.
169*0fca6ea1SDimitry Andric     if (LogDir.empty())
170*0fca6ea1SDimitry Andric       return;
171*0fca6ea1SDimitry Andric 
172*0fca6ea1SDimitry Andric     // If a log directory is specified, create a new file with a unique name in
173*0fca6ea1SDimitry Andric     // that directory.
174*0fca6ea1SDimitry Andric     int Fd;
175*0fca6ea1SDimitry Andric     SmallString<0> PathTemplate;
176*0fca6ea1SDimitry Andric     SmallString<0> RealPath;
177*0fca6ea1SDimitry Andric     sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
178*0fca6ea1SDimitry Andric     if (auto Err =
179*0fca6ea1SDimitry Andric             sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
180*0fca6ea1SDimitry Andric       report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
181*0fca6ea1SDimitry Andric                              "': " + Err.message(),
182*0fca6ea1SDimitry Andric                          /*CrashDiag=*/false);
183*0fca6ea1SDimitry Andric     }
184*0fca6ea1SDimitry Andric 
185*0fca6ea1SDimitry Andric     FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
186*0fca6ea1SDimitry Andric   }
187*0fca6ea1SDimitry Andric 
188*0fca6ea1SDimitry Andric   bool hasLogFile() const { return FileOS != nullptr; }
189*0fca6ea1SDimitry Andric 
190*0fca6ea1SDimitry Andric   raw_ostream &logfile() {
191*0fca6ea1SDimitry Andric     assert(FileOS && "no logfile!");
192*0fca6ea1SDimitry Andric     return *FileOS;
193*0fca6ea1SDimitry Andric   }
194*0fca6ea1SDimitry Andric 
195*0fca6ea1SDimitry Andric   /// \returns true if this SML will log anything either to a file or dbgs().
196*0fca6ea1SDimitry Andric   /// Can be used to avoid expensive computations that are ignored when logging
197*0fca6ea1SDimitry Andric   /// is disabled.
198*0fca6ea1SDimitry Andric   operator bool() const {
199*0fca6ea1SDimitry Andric     return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
200*0fca6ea1SDimitry Andric   }
201*0fca6ea1SDimitry Andric 
202*0fca6ea1SDimitry Andric private:
203*0fca6ea1SDimitry Andric   std::unique_ptr<raw_fd_ostream> FileOS;
204*0fca6ea1SDimitry Andric };
205*0fca6ea1SDimitry Andric 
206*0fca6ea1SDimitry Andric template <typename Ty>
207*0fca6ea1SDimitry Andric static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
208*0fca6ea1SDimitry Andric   static_assert(
209*0fca6ea1SDimitry Andric       !std::is_same_v<Ty, Value>,
210*0fca6ea1SDimitry Andric       "do not print values to logs directly, use handleName instead!");
211*0fca6ea1SDimitry Andric   LLVM_DEBUG(dbgs() << Val);
212*0fca6ea1SDimitry Andric   if (SML.hasLogFile())
213*0fca6ea1SDimitry Andric     SML.logfile() << Val;
214*0fca6ea1SDimitry Andric   return SML;
215*0fca6ea1SDimitry Andric }
216*0fca6ea1SDimitry Andric 
217*0fca6ea1SDimitry Andric /// Calculate the cost of each function in \p M
218*0fca6ea1SDimitry Andric /// \param SML Log Helper
219*0fca6ea1SDimitry Andric /// \param GetTTI Abstract getter for TargetTransformInfo.
220*0fca6ea1SDimitry Andric /// \param M Module to analyze.
221*0fca6ea1SDimitry Andric /// \param CostMap[out] Resulting Function -> Cost map.
222*0fca6ea1SDimitry Andric /// \return The module's total cost.
223*0fca6ea1SDimitry Andric static CostType
224*0fca6ea1SDimitry Andric calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
225*0fca6ea1SDimitry Andric                        DenseMap<const Function *, CostType> &CostMap) {
226*0fca6ea1SDimitry Andric   CostType ModuleCost = 0;
227*0fca6ea1SDimitry Andric   CostType KernelCost = 0;
228*0fca6ea1SDimitry Andric 
229*0fca6ea1SDimitry Andric   for (auto &Fn : M) {
230*0fca6ea1SDimitry Andric     if (Fn.isDeclaration())
231*0fca6ea1SDimitry Andric       continue;
232*0fca6ea1SDimitry Andric 
233*0fca6ea1SDimitry Andric     CostType FnCost = 0;
234*0fca6ea1SDimitry Andric     const auto &TTI = GetTTI(Fn);
235*0fca6ea1SDimitry Andric     for (const auto &BB : Fn) {
236*0fca6ea1SDimitry Andric       for (const auto &I : BB) {
237*0fca6ea1SDimitry Andric         auto Cost =
238*0fca6ea1SDimitry Andric             TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
239*0fca6ea1SDimitry Andric         assert(Cost != InstructionCost::getMax());
240*0fca6ea1SDimitry Andric         // Assume expensive if we can't tell the cost of an instruction.
241*0fca6ea1SDimitry Andric         CostType CostVal =
242*0fca6ea1SDimitry Andric             Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
243*0fca6ea1SDimitry Andric         assert((FnCost + CostVal) >= FnCost && "Overflow!");
244*0fca6ea1SDimitry Andric         FnCost += CostVal;
245*0fca6ea1SDimitry Andric       }
246*0fca6ea1SDimitry Andric     }
247*0fca6ea1SDimitry Andric 
248*0fca6ea1SDimitry Andric     assert(FnCost != 0);
249*0fca6ea1SDimitry Andric 
250*0fca6ea1SDimitry Andric     CostMap[&Fn] = FnCost;
251*0fca6ea1SDimitry Andric     assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
252*0fca6ea1SDimitry Andric     ModuleCost += FnCost;
253*0fca6ea1SDimitry Andric 
254*0fca6ea1SDimitry Andric     if (isEntryPoint(&Fn))
255*0fca6ea1SDimitry Andric       KernelCost += FnCost;
256*0fca6ea1SDimitry Andric   }
257*0fca6ea1SDimitry Andric 
258*0fca6ea1SDimitry Andric   CostType FnCost = (ModuleCost - KernelCost);
259*0fca6ea1SDimitry Andric   CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
260*0fca6ea1SDimitry Andric   SML << "=> Total Module Cost: " << ModuleCost << '\n'
261*0fca6ea1SDimitry Andric       << "  => KernelCost: " << KernelCost << " ("
262*0fca6ea1SDimitry Andric       << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
263*0fca6ea1SDimitry Andric       << "  => FnsCost: " << FnCost << " ("
264*0fca6ea1SDimitry Andric       << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n";
265*0fca6ea1SDimitry Andric 
266*0fca6ea1SDimitry Andric   return ModuleCost;
267*0fca6ea1SDimitry Andric }
268*0fca6ea1SDimitry Andric 
269*0fca6ea1SDimitry Andric static bool canBeIndirectlyCalled(const Function &F) {
270*0fca6ea1SDimitry Andric   if (F.isDeclaration() || isEntryPoint(&F))
271*0fca6ea1SDimitry Andric     return false;
272*0fca6ea1SDimitry Andric   return !F.hasLocalLinkage() ||
273*0fca6ea1SDimitry Andric          F.hasAddressTaken(/*PutOffender=*/nullptr,
274*0fca6ea1SDimitry Andric                            /*IgnoreCallbackUses=*/false,
275*0fca6ea1SDimitry Andric                            /*IgnoreAssumeLikeCalls=*/true,
276*0fca6ea1SDimitry Andric                            /*IgnoreLLVMUsed=*/true,
277*0fca6ea1SDimitry Andric                            /*IgnoreARCAttachedCall=*/false,
278*0fca6ea1SDimitry Andric                            /*IgnoreCastedDirectCall=*/true);
279*0fca6ea1SDimitry Andric }
280*0fca6ea1SDimitry Andric 
281*0fca6ea1SDimitry Andric /// When a function or any of its callees performs an indirect call, this
282*0fca6ea1SDimitry Andric /// takes over \ref addAllDependencies and adds all potentially callable
283*0fca6ea1SDimitry Andric /// functions to \p Fns so they can be counted as dependencies of the function.
284*0fca6ea1SDimitry Andric ///
285*0fca6ea1SDimitry Andric /// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
286*0fca6ea1SDimitry Andric /// presence of an indirect call, the function's resource usage is the same as
287*0fca6ea1SDimitry Andric /// the most expensive function in the module.
288*0fca6ea1SDimitry Andric /// \param M    The module.
289*0fca6ea1SDimitry Andric /// \param Fns[out] Resulting list of functions.
290*0fca6ea1SDimitry Andric static void addAllIndirectCallDependencies(const Module &M,
291*0fca6ea1SDimitry Andric                                            DenseSet<const Function *> &Fns) {
292*0fca6ea1SDimitry Andric   for (const auto &Fn : M) {
293*0fca6ea1SDimitry Andric     if (canBeIndirectlyCalled(Fn))
294*0fca6ea1SDimitry Andric       Fns.insert(&Fn);
295*0fca6ea1SDimitry Andric   }
296*0fca6ea1SDimitry Andric }
297*0fca6ea1SDimitry Andric 
298*0fca6ea1SDimitry Andric /// Adds the functions that \p Fn may call to \p Fns, then recurses into each
299*0fca6ea1SDimitry Andric /// callee until all reachable functions have been gathered.
300*0fca6ea1SDimitry Andric ///
301*0fca6ea1SDimitry Andric /// \param SML Log Helper
302*0fca6ea1SDimitry Andric /// \param CG Call graph for \p Fn's module.
303*0fca6ea1SDimitry Andric /// \param Fn Current function to look at.
304*0fca6ea1SDimitry Andric /// \param Fns[out] Resulting list of functions.
305*0fca6ea1SDimitry Andric /// \param OnlyDirect Whether to only consider direct callees.
306*0fca6ea1SDimitry Andric /// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
307*0fca6ea1SDimitry Andric /// point, either in \p Fn or in one of the function it calls. When that
308*0fca6ea1SDimitry Andric /// happens, we fall back to adding all callable functions inside \p Fn's module
309*0fca6ea1SDimitry Andric /// to \p Fns.
310*0fca6ea1SDimitry Andric static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
311*0fca6ea1SDimitry Andric                                const Function &Fn,
312*0fca6ea1SDimitry Andric                                DenseSet<const Function *> &Fns, bool OnlyDirect,
313*0fca6ea1SDimitry Andric                                bool &HadIndirectCall) {
314*0fca6ea1SDimitry Andric   assert(!Fn.isDeclaration());
315*0fca6ea1SDimitry Andric 
316*0fca6ea1SDimitry Andric   const Module &M = *Fn.getParent();
317*0fca6ea1SDimitry Andric   SmallVector<const Function *> WorkList({&Fn});
318*0fca6ea1SDimitry Andric   while (!WorkList.empty()) {
319*0fca6ea1SDimitry Andric     const auto &CurFn = *WorkList.pop_back_val();
320*0fca6ea1SDimitry Andric     assert(!CurFn.isDeclaration());
321*0fca6ea1SDimitry Andric 
322*0fca6ea1SDimitry Andric     // Scan for an indirect call. If such a call is found, we have to
323*0fca6ea1SDimitry Andric     // conservatively assume this can call all non-entrypoint functions in the
324*0fca6ea1SDimitry Andric     // module.
325*0fca6ea1SDimitry Andric 
326*0fca6ea1SDimitry Andric     for (auto &CGEntry : *CG[&CurFn]) {
327*0fca6ea1SDimitry Andric       auto *CGNode = CGEntry.second;
328*0fca6ea1SDimitry Andric       auto *Callee = CGNode->getFunction();
329*0fca6ea1SDimitry Andric       if (!Callee) {
330*0fca6ea1SDimitry Andric         if (OnlyDirect)
331*0fca6ea1SDimitry Andric           continue;
332*0fca6ea1SDimitry Andric 
333*0fca6ea1SDimitry Andric         // Functions have an edge towards CallsExternalNode if they're external
334*0fca6ea1SDimitry Andric         // declarations, or if they do an indirect call. As we only process
335*0fca6ea1SDimitry Andric         // definitions here, we know this means the function has an indirect
336*0fca6ea1SDimitry Andric         // call. We then have to conservatively assume this can call all
337*0fca6ea1SDimitry Andric         // non-entrypoint functions in the module.
338*0fca6ea1SDimitry Andric         if (CGNode != CG.getCallsExternalNode())
339*0fca6ea1SDimitry Andric           continue; // this is another function-less node we don't care about.
340*0fca6ea1SDimitry Andric 
341*0fca6ea1SDimitry Andric         SML << "Indirect call detected in " << getName(CurFn)
342*0fca6ea1SDimitry Andric             << " - treating all non-entrypoint functions as "
343*0fca6ea1SDimitry Andric                "potential dependencies\n";
344*0fca6ea1SDimitry Andric 
345*0fca6ea1SDimitry Andric         // TODO: Print an ORE as well ?
346*0fca6ea1SDimitry Andric         addAllIndirectCallDependencies(M, Fns);
347*0fca6ea1SDimitry Andric         HadIndirectCall = true;
348*0fca6ea1SDimitry Andric         continue;
349*0fca6ea1SDimitry Andric       }
350*0fca6ea1SDimitry Andric 
351*0fca6ea1SDimitry Andric       if (Callee->isDeclaration())
352*0fca6ea1SDimitry Andric         continue;
353*0fca6ea1SDimitry Andric 
354*0fca6ea1SDimitry Andric       auto [It, Inserted] = Fns.insert(Callee);
355*0fca6ea1SDimitry Andric       if (Inserted)
356*0fca6ea1SDimitry Andric         WorkList.push_back(Callee);
357*0fca6ea1SDimitry Andric     }
358*0fca6ea1SDimitry Andric   }
359*0fca6ea1SDimitry Andric }
360*0fca6ea1SDimitry Andric 
361*0fca6ea1SDimitry Andric /// Contains information about a function and its dependencies.
362*0fca6ea1SDimitry Andric /// This is a splitting root. The splitting algorithm works by
363*0fca6ea1SDimitry Andric /// assigning these to partitions.
364*0fca6ea1SDimitry Andric struct FunctionWithDependencies {
365*0fca6ea1SDimitry Andric   FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
366*0fca6ea1SDimitry Andric                            const DenseMap<const Function *, CostType> &FnCosts,
367*0fca6ea1SDimitry Andric                            const Function *Fn)
368*0fca6ea1SDimitry Andric       : Fn(Fn) {
369*0fca6ea1SDimitry Andric     // When Fn is not a kernel, we don't need to collect indirect callees.
370*0fca6ea1SDimitry Andric     // Resource usage analysis is only performed on kernels, and we collect
371*0fca6ea1SDimitry Andric     // indirect callees for resource usage analysis.
372*0fca6ea1SDimitry Andric     addAllDependencies(SML, CG, *Fn, Dependencies,
373*0fca6ea1SDimitry Andric                        /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall);
374*0fca6ea1SDimitry Andric     TotalCost = FnCosts.at(Fn);
375*0fca6ea1SDimitry Andric     for (const auto *Dep : Dependencies) {
376*0fca6ea1SDimitry Andric       TotalCost += FnCosts.at(Dep);
377*0fca6ea1SDimitry Andric 
378*0fca6ea1SDimitry Andric       // We cannot duplicate functions with external linkage, or functions that
379*0fca6ea1SDimitry Andric       // may be overriden at runtime.
380*0fca6ea1SDimitry Andric       HasNonDuplicatableDependecy |=
381*0fca6ea1SDimitry Andric           (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
382*0fca6ea1SDimitry Andric     }
383*0fca6ea1SDimitry Andric   }
384*0fca6ea1SDimitry Andric 
385*0fca6ea1SDimitry Andric   const Function *Fn = nullptr;
386*0fca6ea1SDimitry Andric   DenseSet<const Function *> Dependencies;
387*0fca6ea1SDimitry Andric   /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
388*0fca6ea1SDimitry Andric   bool HasIndirectCall = false;
389*0fca6ea1SDimitry Andric   /// Whether any of \p Fn's dependencies cannot be duplicated.
390*0fca6ea1SDimitry Andric   bool HasNonDuplicatableDependecy = false;
391*0fca6ea1SDimitry Andric 
392*0fca6ea1SDimitry Andric   CostType TotalCost = 0;
393*0fca6ea1SDimitry Andric 
394*0fca6ea1SDimitry Andric   /// \returns true if this function and its dependencies can be considered
395*0fca6ea1SDimitry Andric   /// large according to \p Threshold.
396*0fca6ea1SDimitry Andric   bool isLarge(CostType Threshold) const {
397*0fca6ea1SDimitry Andric     return TotalCost > Threshold && !Dependencies.empty();
398*0fca6ea1SDimitry Andric   }
399*0fca6ea1SDimitry Andric };
400*0fca6ea1SDimitry Andric 
401*0fca6ea1SDimitry Andric /// Calculates how much overlap there is between \p A and \p B.
402*0fca6ea1SDimitry Andric /// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
403*0fca6ea1SDimitry Andric /// and B have no shared elements. Kernels do not count in overlap calculation.
404*0fca6ea1SDimitry Andric static float calculateOverlap(const DenseSet<const Function *> &A,
405*0fca6ea1SDimitry Andric                               const DenseSet<const Function *> &B) {
406*0fca6ea1SDimitry Andric   DenseSet<const Function *> Total;
407*0fca6ea1SDimitry Andric   for (const auto *F : A) {
408*0fca6ea1SDimitry Andric     if (!isEntryPoint(F))
409*0fca6ea1SDimitry Andric       Total.insert(F);
410*0fca6ea1SDimitry Andric   }
411*0fca6ea1SDimitry Andric 
412*0fca6ea1SDimitry Andric   if (Total.empty())
413*0fca6ea1SDimitry Andric     return 0.0f;
414*0fca6ea1SDimitry Andric 
415*0fca6ea1SDimitry Andric   unsigned NumCommon = 0;
416*0fca6ea1SDimitry Andric   for (const auto *F : B) {
417*0fca6ea1SDimitry Andric     if (isEntryPoint(F))
418*0fca6ea1SDimitry Andric       continue;
419*0fca6ea1SDimitry Andric 
420*0fca6ea1SDimitry Andric     auto [It, Inserted] = Total.insert(F);
421*0fca6ea1SDimitry Andric     if (!Inserted)
422*0fca6ea1SDimitry Andric       ++NumCommon;
423*0fca6ea1SDimitry Andric   }
424*0fca6ea1SDimitry Andric 
425*0fca6ea1SDimitry Andric   return static_cast<float>(NumCommon) / Total.size();
426*0fca6ea1SDimitry Andric }
427*0fca6ea1SDimitry Andric 
428*0fca6ea1SDimitry Andric /// Performs all of the partitioning work on \p M.
429*0fca6ea1SDimitry Andric /// \param SML Log Helper
430*0fca6ea1SDimitry Andric /// \param M Module to partition.
431*0fca6ea1SDimitry Andric /// \param NumParts Number of partitions to create.
432*0fca6ea1SDimitry Andric /// \param ModuleCost Total cost of all functions in \p M.
433*0fca6ea1SDimitry Andric /// \param FnCosts Map of Function -> Cost
434*0fca6ea1SDimitry Andric /// \param WorkList Functions and their dependencies to process in order.
435*0fca6ea1SDimitry Andric /// \returns The created partitions (a vector of size \p NumParts )
436*0fca6ea1SDimitry Andric static std::vector<DenseSet<const Function *>>
437*0fca6ea1SDimitry Andric doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
438*0fca6ea1SDimitry Andric                CostType ModuleCost,
439*0fca6ea1SDimitry Andric                const DenseMap<const Function *, CostType> &FnCosts,
440*0fca6ea1SDimitry Andric                const SmallVector<FunctionWithDependencies> &WorkList) {
441*0fca6ea1SDimitry Andric 
442*0fca6ea1SDimitry Andric   SML << "\n--Partitioning Starts--\n";
443*0fca6ea1SDimitry Andric 
444*0fca6ea1SDimitry Andric   // Calculate a "large function threshold". When more than one function's total
445*0fca6ea1SDimitry Andric   // import cost exceeds this value, we will try to assign it to an existing
446*0fca6ea1SDimitry Andric   // partition to reduce the amount of duplication needed.
447*0fca6ea1SDimitry Andric   //
448*0fca6ea1SDimitry Andric   // e.g. let two functions X and Y have a import cost of ~10% of the module, we
449*0fca6ea1SDimitry Andric   // assign X to a partition as usual, but when we get to Y, we check if it's
450*0fca6ea1SDimitry Andric   // worth also putting it in Y's partition.
451*0fca6ea1SDimitry Andric   const CostType LargeFnThreshold =
452*0fca6ea1SDimitry Andric       LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
453*0fca6ea1SDimitry Andric                     : std::numeric_limits<CostType>::max();
454*0fca6ea1SDimitry Andric 
455*0fca6ea1SDimitry Andric   std::vector<DenseSet<const Function *>> Partitions;
456*0fca6ea1SDimitry Andric   Partitions.resize(NumParts);
457*0fca6ea1SDimitry Andric 
458*0fca6ea1SDimitry Andric   // Assign functions to partitions, and try to keep the partitions more or
459*0fca6ea1SDimitry Andric   // less balanced. We do that through a priority queue sorted in reverse, so we
460*0fca6ea1SDimitry Andric   // can always look at the partition with the least content.
461*0fca6ea1SDimitry Andric   //
462*0fca6ea1SDimitry Andric   // There are some cases where we will be deliberately unbalanced though.
463*0fca6ea1SDimitry Andric   //  - Large functions: we try to merge with existing partitions to reduce code
464*0fca6ea1SDimitry Andric   //  duplication.
465*0fca6ea1SDimitry Andric   //  - Functions with indirect or external calls always go in the first
466*0fca6ea1SDimitry Andric   //  partition (P0).
467*0fca6ea1SDimitry Andric   auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
468*0fca6ea1SDimitry Andric                               const std::pair<PartitionID, CostType> &b) {
469*0fca6ea1SDimitry Andric     // When two partitions have the same cost, assign to the one with the
470*0fca6ea1SDimitry Andric     // biggest ID first. This allows us to put things in P0 last, because P0 may
471*0fca6ea1SDimitry Andric     // have other stuff added later.
472*0fca6ea1SDimitry Andric     if (a.second == b.second)
473*0fca6ea1SDimitry Andric       return a.first < b.first;
474*0fca6ea1SDimitry Andric     return a.second > b.second;
475*0fca6ea1SDimitry Andric   };
476*0fca6ea1SDimitry Andric 
477*0fca6ea1SDimitry Andric   // We can't use priority_queue here because we need to be able to access any
478*0fca6ea1SDimitry Andric   // element. This makes this a bit inefficient as we need to sort it again
479*0fca6ea1SDimitry Andric   // everytime we change it, but it's a very small array anyway (likely under 64
480*0fca6ea1SDimitry Andric   // partitions) so it's a cheap operation.
481*0fca6ea1SDimitry Andric   std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
482*0fca6ea1SDimitry Andric   for (unsigned I = 0; I < NumParts; ++I)
483*0fca6ea1SDimitry Andric     BalancingQueue.emplace_back(I, 0);
484*0fca6ea1SDimitry Andric 
485*0fca6ea1SDimitry Andric   // Helper function to handle assigning a function to a partition. This takes
486*0fca6ea1SDimitry Andric   // care of updating the balancing queue.
487*0fca6ea1SDimitry Andric   const auto AssignToPartition = [&](PartitionID PID,
488*0fca6ea1SDimitry Andric                                      const FunctionWithDependencies &FWD) {
489*0fca6ea1SDimitry Andric     auto &FnsInPart = Partitions[PID];
490*0fca6ea1SDimitry Andric     FnsInPart.insert(FWD.Fn);
491*0fca6ea1SDimitry Andric     FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
492*0fca6ea1SDimitry Andric 
493*0fca6ea1SDimitry Andric     SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n  ->  ";
494*0fca6ea1SDimitry Andric     if (!FWD.Dependencies.empty()) {
495*0fca6ea1SDimitry Andric       SML << FWD.Dependencies.size() << " dependencies added\n";
496*0fca6ea1SDimitry Andric     };
497*0fca6ea1SDimitry Andric 
498*0fca6ea1SDimitry Andric     // Update the balancing queue. we scan backwards because in the common case
499*0fca6ea1SDimitry Andric     // the partition is at the end.
500*0fca6ea1SDimitry Andric     for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
501*0fca6ea1SDimitry Andric       if (QueuePID == PID) {
502*0fca6ea1SDimitry Andric         CostType NewCost = 0;
503*0fca6ea1SDimitry Andric         for (auto *Fn : Partitions[PID])
504*0fca6ea1SDimitry Andric           NewCost += FnCosts.at(Fn);
505*0fca6ea1SDimitry Andric 
506*0fca6ea1SDimitry Andric         SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
507*0fca6ea1SDimitry Andric         if (Cost) {
508*0fca6ea1SDimitry Andric           SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
509*0fca6ea1SDimitry Andric               << "% increase)";
510*0fca6ea1SDimitry Andric         }
511*0fca6ea1SDimitry Andric         SML << '\n';
512*0fca6ea1SDimitry Andric 
513*0fca6ea1SDimitry Andric         Cost = NewCost;
514*0fca6ea1SDimitry Andric       }
515*0fca6ea1SDimitry Andric     }
516*0fca6ea1SDimitry Andric 
517*0fca6ea1SDimitry Andric     sort(BalancingQueue, ComparePartitions);
518*0fca6ea1SDimitry Andric   };
519*0fca6ea1SDimitry Andric 
520*0fca6ea1SDimitry Andric   for (auto &CurFn : WorkList) {
521*0fca6ea1SDimitry Andric     // When a function has indirect calls, it must stay in the first partition
522*0fca6ea1SDimitry Andric     // alongside every reachable non-entry function. This is a nightmare case
523*0fca6ea1SDimitry Andric     // for splitting as it severely limits what we can do.
524*0fca6ea1SDimitry Andric     if (CurFn.HasIndirectCall) {
525*0fca6ea1SDimitry Andric       SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
526*0fca6ea1SDimitry Andric           << " defaulting to P0\n";
527*0fca6ea1SDimitry Andric       AssignToPartition(0, CurFn);
528*0fca6ea1SDimitry Andric       continue;
529*0fca6ea1SDimitry Andric     }
530*0fca6ea1SDimitry Andric 
531*0fca6ea1SDimitry Andric     // When a function has non duplicatable dependencies, we have to keep it in
532*0fca6ea1SDimitry Andric     // the first partition as well. This is a conservative approach, a
533*0fca6ea1SDimitry Andric     // finer-grained approach could keep track of which dependencies are
534*0fca6ea1SDimitry Andric     // non-duplicatable exactly and just make sure they're grouped together.
535*0fca6ea1SDimitry Andric     if (CurFn.HasNonDuplicatableDependecy) {
536*0fca6ea1SDimitry Andric       SML << "Function with externally visible dependency "
537*0fca6ea1SDimitry Andric           << getName(*CurFn.Fn) << " defaulting to P0\n";
538*0fca6ea1SDimitry Andric       AssignToPartition(0, CurFn);
539*0fca6ea1SDimitry Andric       continue;
540*0fca6ea1SDimitry Andric     }
541*0fca6ea1SDimitry Andric 
542*0fca6ea1SDimitry Andric     // Be smart with large functions to avoid duplicating their dependencies.
543*0fca6ea1SDimitry Andric     if (CurFn.isLarge(LargeFnThreshold)) {
544*0fca6ea1SDimitry Andric       assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
545*0fca6ea1SDimitry Andric       SML << "Large Function: " << getName(*CurFn.Fn)
546*0fca6ea1SDimitry Andric           << " - looking for partition with at least "
547*0fca6ea1SDimitry Andric           << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";
548*0fca6ea1SDimitry Andric 
549*0fca6ea1SDimitry Andric       bool Assigned = false;
550*0fca6ea1SDimitry Andric       for (const auto &[PID, Fns] : enumerate(Partitions)) {
551*0fca6ea1SDimitry Andric         float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
552*0fca6ea1SDimitry Andric         SML << "  => " << format("%0.2f", Overlap * 100) << "% overlap with P"
553*0fca6ea1SDimitry Andric             << PID << '\n';
554*0fca6ea1SDimitry Andric         if (Overlap > LargeFnOverlapForMerge) {
555*0fca6ea1SDimitry Andric           SML << "  selecting P" << PID << '\n';
556*0fca6ea1SDimitry Andric           AssignToPartition(PID, CurFn);
557*0fca6ea1SDimitry Andric           Assigned = true;
558*0fca6ea1SDimitry Andric         }
559*0fca6ea1SDimitry Andric       }
560*0fca6ea1SDimitry Andric 
561*0fca6ea1SDimitry Andric       if (Assigned)
562*0fca6ea1SDimitry Andric         continue;
563*0fca6ea1SDimitry Andric     }
564*0fca6ea1SDimitry Andric 
565*0fca6ea1SDimitry Andric     // Normal "load-balancing", assign to partition with least pressure.
566*0fca6ea1SDimitry Andric     auto [PID, CurCost] = BalancingQueue.back();
567*0fca6ea1SDimitry Andric     AssignToPartition(PID, CurFn);
568*0fca6ea1SDimitry Andric   }
569*0fca6ea1SDimitry Andric 
570*0fca6ea1SDimitry Andric   if (SML) {
571*0fca6ea1SDimitry Andric     for (const auto &[Idx, Part] : enumerate(Partitions)) {
572*0fca6ea1SDimitry Andric       CostType Cost = 0;
573*0fca6ea1SDimitry Andric       for (auto *Fn : Part)
574*0fca6ea1SDimitry Andric         Cost += FnCosts.at(Fn);
575*0fca6ea1SDimitry Andric       SML << "P" << Idx << " has a total cost of " << Cost << " ("
576*0fca6ea1SDimitry Andric           << format("%0.2f", (float(Cost) / ModuleCost) * 100)
577*0fca6ea1SDimitry Andric           << "% of source module)\n";
578*0fca6ea1SDimitry Andric     }
579*0fca6ea1SDimitry Andric 
580*0fca6ea1SDimitry Andric     SML << "--Partitioning Done--\n\n";
581*0fca6ea1SDimitry Andric   }
582*0fca6ea1SDimitry Andric 
583*0fca6ea1SDimitry Andric   // Check no functions were missed.
584*0fca6ea1SDimitry Andric #ifndef NDEBUG
585*0fca6ea1SDimitry Andric   DenseSet<const Function *> AllFunctions;
586*0fca6ea1SDimitry Andric   for (const auto &Part : Partitions)
587*0fca6ea1SDimitry Andric     AllFunctions.insert(Part.begin(), Part.end());
588*0fca6ea1SDimitry Andric 
589*0fca6ea1SDimitry Andric   for (auto &Fn : M) {
590*0fca6ea1SDimitry Andric     if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
591*0fca6ea1SDimitry Andric       assert(AllFunctions.contains(&Fn) && "Missed a function?!");
592*0fca6ea1SDimitry Andric     }
593*0fca6ea1SDimitry Andric   }
594*0fca6ea1SDimitry Andric #endif
595*0fca6ea1SDimitry Andric 
596*0fca6ea1SDimitry Andric   return Partitions;
597*0fca6ea1SDimitry Andric }
598*0fca6ea1SDimitry Andric 
599*0fca6ea1SDimitry Andric static void externalize(GlobalValue &GV) {
600*0fca6ea1SDimitry Andric   if (GV.hasLocalLinkage()) {
601*0fca6ea1SDimitry Andric     GV.setLinkage(GlobalValue::ExternalLinkage);
602*0fca6ea1SDimitry Andric     GV.setVisibility(GlobalValue::HiddenVisibility);
603*0fca6ea1SDimitry Andric   }
604*0fca6ea1SDimitry Andric 
605*0fca6ea1SDimitry Andric   // Unnamed entities must be named consistently between modules. setName will
606*0fca6ea1SDimitry Andric   // give a distinct name to each such entity.
607*0fca6ea1SDimitry Andric   if (!GV.hasName())
608*0fca6ea1SDimitry Andric     GV.setName("__llvmsplit_unnamed");
609*0fca6ea1SDimitry Andric }
610*0fca6ea1SDimitry Andric 
611*0fca6ea1SDimitry Andric static bool hasDirectCaller(const Function &Fn) {
612*0fca6ea1SDimitry Andric   for (auto &U : Fn.uses()) {
613*0fca6ea1SDimitry Andric     if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
614*0fca6ea1SDimitry Andric       return true;
615*0fca6ea1SDimitry Andric   }
616*0fca6ea1SDimitry Andric   return false;
617*0fca6ea1SDimitry Andric }
618*0fca6ea1SDimitry Andric 
619*0fca6ea1SDimitry Andric static void splitAMDGPUModule(
620*0fca6ea1SDimitry Andric     GetTTIFn GetTTI, Module &M, unsigned N,
621*0fca6ea1SDimitry Andric     function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
622*0fca6ea1SDimitry Andric 
623*0fca6ea1SDimitry Andric   SplitModuleLogger SML(M);
624*0fca6ea1SDimitry Andric 
625*0fca6ea1SDimitry Andric   CallGraph CG(M);
626*0fca6ea1SDimitry Andric 
627*0fca6ea1SDimitry Andric   // Externalize functions whose address are taken.
628*0fca6ea1SDimitry Andric   //
629*0fca6ea1SDimitry Andric   // This is needed because partitioning is purely based on calls, but sometimes
630*0fca6ea1SDimitry Andric   // a kernel/function may just look at the address of another local function
631*0fca6ea1SDimitry Andric   // and not do anything (no calls). After partitioning, that local function may
632*0fca6ea1SDimitry Andric   // end up in a different module (so it's just a declaration in the module
633*0fca6ea1SDimitry Andric   // where its address is taken), which emits a "undefined hidden symbol" linker
634*0fca6ea1SDimitry Andric   // error.
635*0fca6ea1SDimitry Andric   //
636*0fca6ea1SDimitry Andric   // Additionally, it guides partitioning to not duplicate this function if it's
637*0fca6ea1SDimitry Andric   // called directly at some point.
638*0fca6ea1SDimitry Andric   for (auto &Fn : M) {
639*0fca6ea1SDimitry Andric     if (Fn.hasAddressTaken()) {
640*0fca6ea1SDimitry Andric       if (Fn.hasLocalLinkage()) {
641*0fca6ea1SDimitry Andric         SML << "[externalize] " << Fn.getName()
642*0fca6ea1SDimitry Andric             << " because its address is taken\n";
643*0fca6ea1SDimitry Andric       }
644*0fca6ea1SDimitry Andric       externalize(Fn);
645*0fca6ea1SDimitry Andric     }
646*0fca6ea1SDimitry Andric   }
647*0fca6ea1SDimitry Andric 
648*0fca6ea1SDimitry Andric   // Externalize local GVs, which avoids duplicating their initializers, which
649*0fca6ea1SDimitry Andric   // in turns helps keep code size in check.
650*0fca6ea1SDimitry Andric   if (!NoExternalizeGlobals) {
651*0fca6ea1SDimitry Andric     for (auto &GV : M.globals()) {
652*0fca6ea1SDimitry Andric       if (GV.hasLocalLinkage())
653*0fca6ea1SDimitry Andric         SML << "[externalize] GV " << GV.getName() << '\n';
654*0fca6ea1SDimitry Andric       externalize(GV);
655*0fca6ea1SDimitry Andric     }
656*0fca6ea1SDimitry Andric   }
657*0fca6ea1SDimitry Andric 
658*0fca6ea1SDimitry Andric   // Start by calculating the cost of every function in the module, as well as
659*0fca6ea1SDimitry Andric   // the module's overall cost.
660*0fca6ea1SDimitry Andric   DenseMap<const Function *, CostType> FnCosts;
661*0fca6ea1SDimitry Andric   const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
662*0fca6ea1SDimitry Andric 
663*0fca6ea1SDimitry Andric   // First, gather ever kernel into the worklist.
664*0fca6ea1SDimitry Andric   SmallVector<FunctionWithDependencies> WorkList;
665*0fca6ea1SDimitry Andric   for (auto &Fn : M) {
666*0fca6ea1SDimitry Andric     if (isEntryPoint(&Fn) && !Fn.isDeclaration())
667*0fca6ea1SDimitry Andric       WorkList.emplace_back(SML, CG, FnCosts, &Fn);
668*0fca6ea1SDimitry Andric   }
669*0fca6ea1SDimitry Andric 
670*0fca6ea1SDimitry Andric   // Then, find missing functions that need to be considered as additional
671*0fca6ea1SDimitry Andric   // roots. These can't be called in theory, but in practice we still have to
672*0fca6ea1SDimitry Andric   // handle them to avoid linker errors.
673*0fca6ea1SDimitry Andric   {
674*0fca6ea1SDimitry Andric     DenseSet<const Function *> SeenFunctions;
675*0fca6ea1SDimitry Andric     for (const auto &FWD : WorkList) {
676*0fca6ea1SDimitry Andric       SeenFunctions.insert(FWD.Fn);
677*0fca6ea1SDimitry Andric       SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
678*0fca6ea1SDimitry Andric     }
679*0fca6ea1SDimitry Andric 
680*0fca6ea1SDimitry Andric     for (auto &Fn : M) {
681*0fca6ea1SDimitry Andric       // If this function is not part of any kernel's dependencies and isn't
682*0fca6ea1SDimitry Andric       // directly called, consider it as a root.
683*0fca6ea1SDimitry Andric       if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
684*0fca6ea1SDimitry Andric           !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
685*0fca6ea1SDimitry Andric         WorkList.emplace_back(SML, CG, FnCosts, &Fn);
686*0fca6ea1SDimitry Andric       }
687*0fca6ea1SDimitry Andric     }
688*0fca6ea1SDimitry Andric   }
689*0fca6ea1SDimitry Andric 
690*0fca6ea1SDimitry Andric   // Sort the worklist so the most expensive roots are seen first.
691*0fca6ea1SDimitry Andric   sort(WorkList, [&](auto &A, auto &B) {
692*0fca6ea1SDimitry Andric     // Sort by total cost, and if the total cost is identical, sort
693*0fca6ea1SDimitry Andric     // alphabetically.
694*0fca6ea1SDimitry Andric     if (A.TotalCost == B.TotalCost)
695*0fca6ea1SDimitry Andric       return A.Fn->getName() < B.Fn->getName();
696*0fca6ea1SDimitry Andric     return A.TotalCost > B.TotalCost;
697*0fca6ea1SDimitry Andric   });
698*0fca6ea1SDimitry Andric 
699*0fca6ea1SDimitry Andric   if (SML) {
700*0fca6ea1SDimitry Andric     SML << "Worklist\n";
701*0fca6ea1SDimitry Andric     for (const auto &FWD : WorkList) {
702*0fca6ea1SDimitry Andric       SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
703*0fca6ea1SDimitry Andric           << " indirect:" << FWD.HasIndirectCall
704*0fca6ea1SDimitry Andric           << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
705*0fca6ea1SDimitry Andric           << ")\n";
706*0fca6ea1SDimitry Andric       // Sort function names before printing to ensure determinism.
707*0fca6ea1SDimitry Andric       SmallVector<std::string> SortedDepNames;
708*0fca6ea1SDimitry Andric       SortedDepNames.reserve(FWD.Dependencies.size());
709*0fca6ea1SDimitry Andric       for (const auto *Dep : FWD.Dependencies)
710*0fca6ea1SDimitry Andric         SortedDepNames.push_back(getName(*Dep));
711*0fca6ea1SDimitry Andric       sort(SortedDepNames);
712*0fca6ea1SDimitry Andric 
713*0fca6ea1SDimitry Andric       for (const auto &Name : SortedDepNames)
714*0fca6ea1SDimitry Andric         SML << "  [dependency] " << Name << '\n';
715*0fca6ea1SDimitry Andric     }
716*0fca6ea1SDimitry Andric   }
717*0fca6ea1SDimitry Andric 
718*0fca6ea1SDimitry Andric   // This performs all of the partitioning work.
719*0fca6ea1SDimitry Andric   auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
720*0fca6ea1SDimitry Andric   assert(Partitions.size() == N);
721*0fca6ea1SDimitry Andric 
722*0fca6ea1SDimitry Andric   // If we didn't externalize GVs, then local GVs need to be conservatively
723*0fca6ea1SDimitry Andric   // imported into every module (including their initializers), and then cleaned
724*0fca6ea1SDimitry Andric   // up afterwards.
725*0fca6ea1SDimitry Andric   const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
726*0fca6ea1SDimitry Andric     // We conservatively import private/internal GVs into every module and clean
727*0fca6ea1SDimitry Andric     // them up afterwards.
728*0fca6ea1SDimitry Andric     const auto *Var = dyn_cast<GlobalVariable>(GV);
729*0fca6ea1SDimitry Andric     return Var && Var->hasLocalLinkage();
730*0fca6ea1SDimitry Andric   };
731*0fca6ea1SDimitry Andric 
732*0fca6ea1SDimitry Andric   SML << "Creating " << N << " modules...\n";
733*0fca6ea1SDimitry Andric   unsigned TotalFnImpls = 0;
734*0fca6ea1SDimitry Andric   for (unsigned I = 0; I < N; ++I) {
735*0fca6ea1SDimitry Andric     const auto &FnsInPart = Partitions[I];
736*0fca6ea1SDimitry Andric 
737*0fca6ea1SDimitry Andric     ValueToValueMapTy VMap;
738*0fca6ea1SDimitry Andric     std::unique_ptr<Module> MPart(
739*0fca6ea1SDimitry Andric         CloneModule(M, VMap, [&](const GlobalValue *GV) {
740*0fca6ea1SDimitry Andric           // Functions go in their assigned partition.
741*0fca6ea1SDimitry Andric           if (const auto *Fn = dyn_cast<Function>(GV))
742*0fca6ea1SDimitry Andric             return FnsInPart.contains(Fn);
743*0fca6ea1SDimitry Andric 
744*0fca6ea1SDimitry Andric           if (NeedsConservativeImport(GV))
745*0fca6ea1SDimitry Andric             return true;
746*0fca6ea1SDimitry Andric 
747*0fca6ea1SDimitry Andric           // Everything else goes in the first partition.
748*0fca6ea1SDimitry Andric           return I == 0;
749*0fca6ea1SDimitry Andric         }));
750*0fca6ea1SDimitry Andric 
751*0fca6ea1SDimitry Andric     // Clean-up conservatively imported GVs without any users.
752*0fca6ea1SDimitry Andric     for (auto &GV : make_early_inc_range(MPart->globals())) {
753*0fca6ea1SDimitry Andric       if (NeedsConservativeImport(&GV) && GV.use_empty())
754*0fca6ea1SDimitry Andric         GV.eraseFromParent();
755*0fca6ea1SDimitry Andric     }
756*0fca6ea1SDimitry Andric 
757*0fca6ea1SDimitry Andric     unsigned NumAllFns = 0, NumKernels = 0;
758*0fca6ea1SDimitry Andric     for (auto &Cur : *MPart) {
759*0fca6ea1SDimitry Andric       if (!Cur.isDeclaration()) {
760*0fca6ea1SDimitry Andric         ++NumAllFns;
761*0fca6ea1SDimitry Andric         if (isEntryPoint(&Cur))
762*0fca6ea1SDimitry Andric           ++NumKernels;
763*0fca6ea1SDimitry Andric       }
764*0fca6ea1SDimitry Andric     }
765*0fca6ea1SDimitry Andric     TotalFnImpls += NumAllFns;
766*0fca6ea1SDimitry Andric     SML << "  - Module " << I << " with " << NumAllFns << " functions ("
767*0fca6ea1SDimitry Andric         << NumKernels << " kernels)\n";
768*0fca6ea1SDimitry Andric     ModuleCallback(std::move(MPart));
769*0fca6ea1SDimitry Andric   }
770*0fca6ea1SDimitry Andric 
771*0fca6ea1SDimitry Andric   SML << TotalFnImpls << " function definitions across all modules ("
772*0fca6ea1SDimitry Andric       << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
773*0fca6ea1SDimitry Andric       << "% of original module)\n";
774*0fca6ea1SDimitry Andric }
775*0fca6ea1SDimitry Andric } // namespace
776*0fca6ea1SDimitry Andric 
777*0fca6ea1SDimitry Andric PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
778*0fca6ea1SDimitry Andric                                              ModuleAnalysisManager &MAM) {
779*0fca6ea1SDimitry Andric   FunctionAnalysisManager &FAM =
780*0fca6ea1SDimitry Andric       MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
781*0fca6ea1SDimitry Andric   const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
782*0fca6ea1SDimitry Andric     return FAM.getResult<TargetIRAnalysis>(F);
783*0fca6ea1SDimitry Andric   };
784*0fca6ea1SDimitry Andric   splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
785*0fca6ea1SDimitry Andric   // We don't change the original module.
786*0fca6ea1SDimitry Andric   return PreservedAnalyses::all();
787*0fca6ea1SDimitry Andric }
788