//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen. This is completely
/// different from the common SplitModule pass, as this system is designed with
/// AMDGPU in mind.
///
/// The basic idea of this module splitting implementation is the same as
/// SplitModule: load-balance the module's functions across a set of N
/// partitions to allow parallel codegen. However, it does so very
/// differently from the target-agnostic variant:
///   - The module has "split roots", which are kernels in the vast
///     majority of cases.
///   - Each root has a set of dependencies, and when a root and its
///     dependencies are considered "big", we try to put it in a partition
///     where most of its dependencies are already imported, to avoid
///     duplicating large amounts of code.
///   - There's special care for indirect calls in order to ensure
///     AMDGPUResourceUsageAnalysis can work correctly.
///
/// This file also includes a more elaborate logging system to enable
/// users to easily generate logs that (if desired) do not include any value
/// names, in order to not leak information about the source file.
/// Such logs are very helpful to understand and fix potential issues with
/// module splitting.
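///
/// Illustrative (hypothetical) example: kernels K1 and K2 both call helpers H1
/// and H2, while kernel K3 only calls H3. With N = 2, K1 is assigned to P0
/// along with H1 and H2. If K2 and its dependencies are considered "big", the
/// pass prefers P0 (where H1 and H2 already live) over duplicating them into
/// another partition, and K3 ends up in P1 with H3.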

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/SHA256.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-split-module"

namespace {

static cl::opt<float> LargeFnFactor(
    "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
    cl::Hidden,
    cl::desc(
        "consider a function as large and needing special treatment when the "
        "cost of importing it into a partition "
        "exceeds the average cost of a partition by this factor; e.g. 2.0 "
        "means the function and its dependencies are 2 times bigger than "
        "an average partition; 0 disables handling of large functions entirely"));

static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
    cl::Hidden,
    cl::desc(
        "defines how much overlap between two large functions' dependencies "
        "is needed to put them in the same partition"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local linkage; "
             "may cause globals to be duplicated which increases binary size"));

static cl::opt<std::string>
    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
              cl::desc("output directory for AMDGPU module splitting logs"));

static cl::opt<bool>
    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
               cl::desc("hash value names before printing them in the AMDGPU "
                        "module splitting logs"));

using CostType = InstructionCost::CostType;
using PartitionID = unsigned;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;

static bool isEntryPoint(const Function *F) {
  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
}

static std::string getName(const Value &V) {
  static bool HideNames;

  static llvm::once_flag HideNameInitFlag;
  llvm::call_once(HideNameInitFlag, [&]() {
    if (LogPrivate.getNumOccurrences())
      HideNames = LogPrivate;
    else {
      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
      HideNames = (EV.value_or("0") != "0");
    }
  });

  if (!HideNames)
    return V.getName().str();
  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
               /*LowerCase=*/true);
}

/// Main logging helper.
///
/// Logging can be configured by the following environment variables.
///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
///     If set, uses <filepath> as the directory to write logfiles to
///     each time module splitting is used.
///   AMD_SPLIT_MODULE_LOG_PRIVATE
///     If set to anything other than zero, all names are hidden.
///
/// Both environment variables have corresponding CL options which
/// take priority over them.
///
/// Any output printed to the log files is also printed to dbgs() when -debug
/// is used and LLVM_DEBUG is defined.
///
/// This approach has a small disadvantage over LLVM_DEBUG though: logging
/// logic cannot be removed from the code (by building without debug). This
/// probably has a small performance cost because if some
/// computation/formatting is needed for logging purposes, it may be done
/// every time only to be ignored by the logger.
///
/// As this pass only runs once and is not doing anything computationally
/// expensive, this is likely a reasonable trade-off.
///
/// If some computation should really be avoided when unused, users of the
/// class can check whether any logging will occur by using the bool operator.
///
/// \code
///   if (SML) {
///     // Executes only if logging to a file or if -debug is available and
///     // used.
///   }
/// \endcode
class SplitModuleLogger {
public:
  SplitModuleLogger(const Module &M) {
    std::string LogDir = LogDirOpt;
    if (LogDir.empty())
      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");

    // No log dir specified means we don't need to log to a file.
    // We may still log to dbgs(), though.
    if (LogDir.empty())
      return;

    // If a log directory is specified, create a new file with a unique name in
    // that directory.
    int Fd;
    SmallString<0> PathTemplate;
    SmallString<0> RealPath;
    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
    if (auto Err =
            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
                             "': " + Err.message(),
                         /*CrashDiag=*/false);
    }

    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
  }

  bool hasLogFile() const { return FileOS != nullptr; }

  raw_ostream &logfile() {
    assert(FileOS && "no logfile!");
    return *FileOS;
  }

  /// \returns true if this SML will log anything either to a file or dbgs().
  /// Can be used to avoid expensive computations that are ignored when logging
  /// is disabled.
  operator bool() const {
    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
  }

private:
  std::unique_ptr<raw_fd_ostream> FileOS;
};

template <typename Ty>
static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
  static_assert(
      !std::is_same_v<Ty, Value>,
      "do not print values to logs directly, use handleName instead!");
  LLVM_DEBUG(dbgs() << Val);
  if (SML.hasLogFile())
    SML.logfile() << Val;
  return SML;
}

/// Calculate the cost of each function in \p M
/// \param SML Log Helper
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
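///
/// For example (hypothetical numbers): a module containing one kernel with a
/// code-size cost of 70 and one non-kernel helper with a cost of 30 yields a
/// ModuleCost of 100; the log then reports KernelCost as 70 (70.00%) and
/// FnsCost as 30 (30.00%).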
static CostType
calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
                       DenseMap<const Function *, CostType> &CostMap) {
  CostType ModuleCost = 0;
  CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal =
            Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (isEntryPoint(&Fn))
      KernelCost += FnCost;
  }

  CostType FnCost = (ModuleCost - KernelCost);
  CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
  SML << "=> Total Module Cost: " << ModuleCost << '\n'
      << " => KernelCost: " << KernelCost << " ("
      << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
      << " => FnsCost: " << FnCost << " ("
      << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n";

  return ModuleCost;
}

static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || isEntryPoint(&F))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}

/// When a function or any of its callees performs an indirect call, this
/// takes over \ref addAllDependencies and adds all potentially callable
/// functions to \p Fns so they can be counted as dependencies of the function.
///
/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
/// presence of an indirect call, the function's resource usage is the same as
/// the most expensive function in the module.
/// \param M The module.
/// \param Fns[out] Resulting list of functions.
static void addAllIndirectCallDependencies(const Module &M,
                                           DenseSet<const Function *> &Fns) {
  for (const auto &Fn : M) {
    if (canBeIndirectlyCalled(Fn))
      Fns.insert(&Fn);
  }
}

/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
/// callee until all reachable functions have been gathered.
///
/// \param SML Log Helper
/// \param CG Call graph for \p Fn's module.
/// \param Fn Current function to look at.
/// \param Fns[out] Resulting list of functions.
/// \param OnlyDirect Whether to only consider direct callees.
/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
/// point, either in \p Fn or in one of the functions it calls. When that
/// happens, we fall back to adding all callable functions inside \p Fn's module
/// to \p Fns.
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
                               const Function &Fn,
                               DenseSet<const Function *> &Fns, bool OnlyDirect,
                               bool &HadIndirectCall) {
  assert(!Fn.isDeclaration());

  const Module &M = *Fn.getParent();
  SmallVector<const Function *> WorkList({&Fn});
  while (!WorkList.empty()) {
    const auto &CurFn = *WorkList.pop_back_val();
    assert(!CurFn.isDeclaration());

    // Scan for an indirect call. If such a call is found, we have to
    // conservatively assume this can call all non-entrypoint functions in the
    // module.

    for (auto &CGEntry : *CG[&CurFn]) {
      auto *CGNode = CGEntry.second;
      auto *Callee = CGNode->getFunction();
      if (!Callee) {
        if (OnlyDirect)
          continue;

        // Functions have an edge towards CallsExternalNode if they're external
        // declarations, or if they do an indirect call. As we only process
        // definitions here, we know this means the function has an indirect
        // call. We then have to conservatively assume this can call all
        // non-entrypoint functions in the module.
        if (CGNode != CG.getCallsExternalNode())
          continue; // this is another function-less node we don't care about.

        SML << "Indirect call detected in " << getName(CurFn)
            << " - treating all non-entrypoint functions as "
               "potential dependencies\n";

        // TODO: Print an ORE as well?
        addAllIndirectCallDependencies(M, Fns);
        HadIndirectCall = true;
        continue;
      }

      if (Callee->isDeclaration())
        continue;

      auto [It, Inserted] = Fns.insert(Callee);
      if (Inserted)
        WorkList.push_back(Callee);
    }
  }
}

/// Contains information about a function and its dependencies.
/// This is a splitting root. The splitting algorithm works by
/// assigning these to partitions.
struct FunctionWithDependencies {
  FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
                           const DenseMap<const Function *, CostType> &FnCosts,
                           const Function *Fn)
      : Fn(Fn) {
    // When Fn is not a kernel, we don't need to collect indirect callees.
    // Resource usage analysis is only performed on kernels, and we collect
    // indirect callees for resource usage analysis.
    addAllDependencies(SML, CG, *Fn, Dependencies,
                       /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall);
    TotalCost = FnCosts.at(Fn);
    for (const auto *Dep : Dependencies) {
      TotalCost += FnCosts.at(Dep);

      // We cannot duplicate functions with external linkage, or functions that
      // may be overridden at runtime.
      HasNonDuplicatableDependecy |=
          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
    }
  }

  const Function *Fn = nullptr;
  DenseSet<const Function *> Dependencies;
  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
  bool HasIndirectCall = false;
  /// Whether any of \p Fn's dependencies cannot be duplicated.
  bool HasNonDuplicatableDependecy = false;

  CostType TotalCost = 0;

  /// \returns true if this function and its dependencies can be considered
  /// large according to \p Threshold.
  bool isLarge(CostType Threshold) const {
    return TotalCost > Threshold && !Dependencies.empty();
  }
};

/// Calculates how much overlap there is between \p A and \p B.
/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means
/// A and B have no shared elements. Kernels do not count in overlap
/// calculation.
static float calculateOverlap(const DenseSet<const Function *> &A,
                              const DenseSet<const Function *> &B) {
  DenseSet<const Function *> Total;
  for (const auto *F : A) {
    if (!isEntryPoint(F))
      Total.insert(F);
  }

  if (Total.empty())
    return 0.0f;

  unsigned NumCommon = 0;
  for (const auto *F : B) {
    if (isEntryPoint(F))
      continue;

    auto [It, Inserted] = Total.insert(F);
    if (!Inserted)
      ++NumCommon;
  }

  return static_cast<float>(NumCommon) / Total.size();
}

/// Performs all of the partitioning work on \p M.
/// \param SML Log Helper
/// \param M Module to partition.
/// \param NumParts Number of partitions to create.
/// \param ModuleCost Total cost of all functions in \p M.
/// \param FnCosts Map of Function -> Cost
/// \param WorkList Functions and their dependencies to process in order.
/// \returns The created partitions (a vector of size \p NumParts)
static std::vector<DenseSet<const Function *>>
doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
               CostType ModuleCost,
               const DenseMap<const Function *, CostType> &FnCosts,
               const SmallVector<FunctionWithDependencies> &WorkList) {

  SML << "\n--Partitioning Starts--\n";

  // Calculate a "large function threshold". When a function's total import
  // cost exceeds this value, we will try to assign it to an existing partition
  // to reduce the amount of duplication needed.
  //
  // e.g. let two functions X and Y have an import cost of ~10% of the module; we
  // assign X to a partition as usual, but when we get to Y, we check if it's
  // worth also putting it in X's partition.
  const CostType LargeFnThreshold =
      LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
                    : std::numeric_limits<CostType>::max();

  std::vector<DenseSet<const Function *>> Partitions;
  Partitions.resize(NumParts);

  // Assign functions to partitions, and try to keep the partitions more or
  // less balanced. We do that through a priority queue sorted in reverse, so
  // we can always look at the partition with the least content.
  //
  // There are some cases where we will be deliberately unbalanced though.
  //  - Large functions: we try to merge with existing partitions to reduce
  //    code duplication.
  //  - Functions with indirect or external calls always go in the first
  //    partition (P0).
  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
                              const std::pair<PartitionID, CostType> &b) {
    // When two partitions have the same cost, assign to the one with the
    // biggest ID first. This allows us to put things in P0 last, because P0
    // may have other stuff added later.
    if (a.second == b.second)
      return a.first < b.first;
    return a.second > b.second;
  };

  // We can't use priority_queue here because we need to be able to access any
  // element. This makes this a bit inefficient as we need to sort it again
  // every time we change it, but it's a very small array anyway (likely under
  // 64 partitions) so it's a cheap operation.
  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
  for (unsigned I = 0; I < NumParts; ++I)
    BalancingQueue.emplace_back(I, 0);

  // Helper function to handle assigning a function to a partition. This takes
  // care of updating the balancing queue.
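  //
  // Illustrative (hypothetical) walkthrough of the queue above: with three
  // partitions whose current costs are {P0: 0, P1: 5, P2: 0}, sorting with
  // ComparePartitions yields [P1(5), P0(0), P2(0)], so BalancingQueue.back()
  // is P2 and receives the next root. Among equal-cost partitions the highest
  // ID is chosen first, keeping P0 free for functions that must default to it.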
  const auto AssignToPartition = [&](PartitionID PID,
                                     const FunctionWithDependencies &FWD) {
    auto &FnsInPart = Partitions[PID];
    FnsInPart.insert(FWD.Fn);
    FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());

    SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> ";
    if (!FWD.Dependencies.empty()) {
      SML << FWD.Dependencies.size() << " dependencies added\n";
    }

    // Update the balancing queue. We scan backwards because in the common case
    // the partition is at the end.
    for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
      if (QueuePID == PID) {
        CostType NewCost = 0;
        for (auto *Fn : Partitions[PID])
          NewCost += FnCosts.at(Fn);

        SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
        if (Cost) {
          SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
              << "% increase)";
        }
        SML << '\n';

        Cost = NewCost;
      }
    }

    sort(BalancingQueue, ComparePartitions);
  };

  for (auto &CurFn : WorkList) {
    // When a function has indirect calls, it must stay in the first partition
    // alongside every reachable non-entry function. This is a nightmare case
    // for splitting as it severely limits what we can do.
    if (CurFn.HasIndirectCall) {
      SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
          << " defaulting to P0\n";
      AssignToPartition(0, CurFn);
      continue;
    }

    // When a function has non-duplicatable dependencies, we have to keep it in
    // the first partition as well. This is a conservative approach; a
    // finer-grained approach could keep track of exactly which dependencies
    // are non-duplicatable and just make sure they're grouped together.
    if (CurFn.HasNonDuplicatableDependecy) {
      SML << "Function with externally visible dependency "
          << getName(*CurFn.Fn) << " defaulting to P0\n";
      AssignToPartition(0, CurFn);
      continue;
    }

    // Be smart with large functions to avoid duplicating their dependencies.
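    //
    // Illustrative (hypothetical) numbers: with ModuleCost = 1000, NumParts = 4
    // and LargeFnFactor = 2.0, LargeFnThreshold is (1000 / 4) * 2.0 = 500, so a
    // root whose TotalCost exceeds 500 (and that has dependencies) takes the
    // merge path below. With LargeFnOverlapForMerge = 0.8 it is assigned to
    // each partition that already contains strictly more than 80% of its
    // non-kernel dependencies.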
    if (CurFn.isLarge(LargeFnThreshold)) {
      assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
      SML << "Large Function: " << getName(*CurFn.Fn)
          << " - looking for partition with at least "
          << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";

      bool Assigned = false;
      for (const auto &[PID, Fns] : enumerate(Partitions)) {
        float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
        SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P"
            << PID << '\n';
        if (Overlap > LargeFnOverlapForMerge) {
          SML << " selecting P" << PID << '\n';
          AssignToPartition(PID, CurFn);
          Assigned = true;
        }
      }

      if (Assigned)
        continue;
    }

    // Normal "load-balancing", assign to partition with least pressure.
    auto [PID, CurCost] = BalancingQueue.back();
    AssignToPartition(PID, CurFn);
  }

  if (SML) {
    for (const auto &[Idx, Part] : enumerate(Partitions)) {
      CostType Cost = 0;
      for (auto *Fn : Part)
        Cost += FnCosts.at(Fn);
      SML << "P" << Idx << " has a total cost of " << Cost << " ("
          << format("%0.2f", (float(Cost) / ModuleCost) * 100)
          << "% of source module)\n";
    }

    SML << "--Partitioning Done--\n\n";
  }

  // Check no functions were missed.
#ifndef NDEBUG
  DenseSet<const Function *> AllFunctions;
  for (const auto &Part : Partitions)
    AllFunctions.insert(Part.begin(), Part.end());

  for (auto &Fn : M) {
    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
      assert(AllFunctions.contains(&Fn) && "Missed a function?!");
    }
  }
#endif

  return Partitions;
}

static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}

static bool hasDirectCaller(const Function &Fn) {
  for (auto &U : Fn.uses()) {
    if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
      return true;
  }
  return false;
}

static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned N,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {

  SplitModuleLogger SML(M);

  CallGraph CG(M);

  // Externalize functions whose addresses are taken.
  //
  // This is needed because partitioning is purely based on calls, but
  // sometimes a kernel/function may just look at the address of another local
  // function and not do anything (no calls). After partitioning, that local
  // function may end up in a different module (so it's just a declaration in
  // the module where its address is taken), which emits an "undefined hidden
  // symbol" linker error.
  //
  // Additionally, it guides partitioning to not duplicate this function if
  // it's called directly at some point.
  for (auto &Fn : M) {
    if (Fn.hasAddressTaken()) {
      if (Fn.hasLocalLinkage()) {
        SML << "[externalize] " << Fn.getName()
            << " because its address is taken\n";
      }
      externalize(Fn);
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        SML << "[externalize] GV " << GV.getName() << '\n';
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  DenseMap<const Function *, CostType> FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);

  // First, gather every kernel into the worklist.
  SmallVector<FunctionWithDependencies> WorkList;
  for (auto &Fn : M) {
    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
  }

  // Then, find missing functions that need to be considered as additional
  // roots. These can't be called in theory, but in practice we still have to
  // handle them to avoid linker errors.
  {
    DenseSet<const Function *> SeenFunctions;
    for (const auto &FWD : WorkList) {
      SeenFunctions.insert(FWD.Fn);
      SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
    }

    for (auto &Fn : M) {
      // If this function is not part of any kernel's dependencies and isn't
      // directly called, consider it as a root.
      if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
          !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
        WorkList.emplace_back(SML, CG, FnCosts, &Fn);
      }
    }
  }

  // Sort the worklist so the most expensive roots are seen first.
  sort(WorkList, [&](auto &A, auto &B) {
    // Sort by total cost, and if the total cost is identical, sort
    // alphabetically.
    if (A.TotalCost == B.TotalCost)
      return A.Fn->getName() < B.Fn->getName();
    return A.TotalCost > B.TotalCost;
  });

  if (SML) {
    SML << "Worklist\n";
    for (const auto &FWD : WorkList) {
      SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
          << " indirect:" << FWD.HasIndirectCall
          << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
          << ")\n";
      // Sort function names before printing to ensure determinism.
      SmallVector<std::string> SortedDepNames;
      SortedDepNames.reserve(FWD.Dependencies.size());
      for (const auto *Dep : FWD.Dependencies)
        SortedDepNames.push_back(getName(*Dep));
      sort(SortedDepNames);

      for (const auto &Name : SortedDepNames)
        SML << "  [dependency] " << Name << '\n';
    }
  }

  // This performs all of the partitioning work.
  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
  assert(Partitions.size() == N);

  // If we didn't externalize GVs, then local GVs need to be conservatively
  // imported into every module (including their initializers), and then
  // cleaned up afterwards.
  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
    // We conservatively import private/internal GVs into every module and
    // clean them up afterwards.
    const auto *Var = dyn_cast<GlobalVariable>(GV);
    return Var && Var->hasLocalLinkage();
  };

  SML << "Creating " << N << " modules...\n";
  unsigned TotalFnImpls = 0;
  for (unsigned I = 0; I < N; ++I) {
    const auto &FnsInPart = Partitions[I];

    ValueToValueMapTy VMap;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV))
            return FnsInPart.contains(Fn);

          if (NeedsConservativeImport(GV))
            return true;

          // Everything else goes in the first partition.
          return I == 0;
        }));

    // Clean up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->globals())) {
      if (NeedsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    unsigned NumAllFns = 0, NumKernels = 0;
    for (auto &Cur : *MPart) {
      if (!Cur.isDeclaration()) {
        ++NumAllFns;
        if (isEntryPoint(&Cur))
          ++NumKernels;
      }
    }
    TotalFnImpls += NumAllFns;
    SML << " - Module " << I << " with " << NumAllFns << " functions ("
        << NumKernels << " kernels)\n";
    ModuleCallback(std::move(MPart));
  }

  SML << TotalFnImpls << " function definitions across all modules ("
      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
      << "% of original module)\n";
}
} // namespace

PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };
  splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
  // We don't change the original module.
  return PreservedAnalyses::all();
}