//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen. This is completely
/// different from the common SplitModule pass, as this system is designed with
/// AMDGPU in mind.
///
/// The basic idea of this module splitting implementation is the same as
/// SplitModule: load-balance the module's functions across a set of N
/// partitions to allow parallel codegen. However, it does it very
/// differently than the target-agnostic variant:
///   - Kernels are used as the module's "roots".
///     They're known entry points on AMDGPU, and everything else is often
///     internal only.
///   - Each kernel has a set of dependencies, and when a kernel and its
///     dependencies are considered "big", we try to put it in a partition
///     where most dependencies are already imported, to avoid duplicating
///     large amounts of code.
///   - There's special care for indirect calls in order to ensure
///     AMDGPUResourceUsageAnalysis can work correctly.
///
/// This file also includes a more elaborate logging system to enable
/// users to easily generate logs that (if desired) do not include any value
/// names, in order to not leak information about the source file.
/// Such logs are very helpful to understand and fix potential issues with
/// module splitting.

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/SHA256.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-split-module"

namespace {

static cl::opt<float> LargeKernelFactor(
    "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
    cl::Hidden,
    cl::desc(
        "consider a kernel as large and needing special treatment when it "
        "exceeds the average cost of a partition by this factor; e.g. 2.0 "
        "means the kernel and its dependencies are twice as big as "
        "an average partition; 0 disables large kernel handling entirely"));
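
// Illustration: with 4 partitions and a total module cost of 4000, the average
// partition cost is 1000; with the default factor of 2.0, a kernel with
// dependencies whose total cost exceeds 2000 is then treated as "large" (see
// LargeKernelThreshold in doPartitioning below).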

static cl::opt<float> LargeKernelOverlapForMerge(
    "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
    cl::Hidden,
    cl::desc("defines how much overlap between two large kernels' dependencies "
             "is needed to put them in the same partition"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local linkage; "
             "may cause globals to be duplicated which increases binary size"));

static cl::opt<std::string>
    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
              cl::desc("output directory for AMDGPU module splitting logs"));

static cl::opt<bool>
    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
               cl::desc("hash value names before printing them in the AMDGPU "
                        "module splitting logs"));

using CostType = InstructionCost::CostType;
using PartitionID = unsigned;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;

static bool isEntryPoint(const Function *F) {
  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
}

static std::string getName(const Value &V) {
  static bool HideNames;

  static llvm::once_flag HideNameInitFlag;
  llvm::call_once(HideNameInitFlag, [&]() {
    if (LogPrivate.getNumOccurrences())
      HideNames = LogPrivate;
    else {
      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
      HideNames = (EV.value_or("0") != "0");
    }
  });

  if (!HideNames)
    return V.getName().str();
  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
               /*LowerCase=*/true);
}

/// Main logging helper.
///
/// Logging can be configured by the following environment variables.
///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
///     If set, uses <filepath> as the directory to write logfiles to
///     each time module splitting is used.
///   AMD_SPLIT_MODULE_LOG_PRIVATE
///     If set to anything other than zero, all names are hidden.
///
/// Both environment variables have corresponding CL options which
/// take priority over them.
///
/// Any output printed to the log files is also printed to dbgs() when -debug is
/// used and LLVM_DEBUG is defined.
///
/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
/// cannot be removed from the code (by building without debug). This probably
/// has a small performance cost because if some computation/formatting is
/// needed for logging purposes, it may be done every time only to be ignored
/// by the logger.
///
/// As this pass only runs once and is not doing anything computationally
/// expensive, this is likely a reasonable trade-off.
///
/// If some computation should really be avoided when unused, users of the
/// class can check whether any logging will occur by using the bool operator.
///
/// \code
///   if (SML) {
///     // Executes only if logging to a file or if -debug is available and
///     // used.
///   }
/// \endcode
class SplitModuleLogger {
public:
  SplitModuleLogger(const Module &M) {
    std::string LogDir = LogDirOpt;
    if (LogDir.empty())
      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");

    // No log dir specified means we don't need to log to a file.
    // We may still log to dbgs(), though.
    if (LogDir.empty())
      return;

    // If a log directory is specified, create a new file with a unique name in
    // that directory.
    int Fd;
    SmallString<0> PathTemplate;
    SmallString<0> RealPath;
    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
    if (auto Err =
            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
                             "': " + Err.message(),
                         /*CrashDiag=*/false);
    }

    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
  }

  bool hasLogFile() const { return FileOS != nullptr; }

  raw_ostream &logfile() {
    assert(FileOS && "no logfile!");
    return *FileOS;
  }

  /// \returns true if this SML will log anything either to a file or dbgs().
  /// Can be used to avoid expensive computations that are ignored when logging
  /// is disabled.
  operator bool() const {
    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
  }

private:
  std::unique_ptr<raw_fd_ostream> FileOS;
};

template <typename Ty>
static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
  static_assert(
      !std::is_same_v<Ty, Value>,
      "do not print values to logs directly, use getName instead!");
  LLVM_DEBUG(dbgs() << Val);
  if (SML.hasLogFile())
    SML.logfile() << Val;
  return SML;
}
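
// Typical usage (mirroring splitAMDGPUModule below):
//   SplitModuleLogger SML(M);
//   SML << "Creating " << N << " modules...\n";
// Note that Values must not be streamed directly (see the static_assert
// above); print getName(V) instead so that name hiding is honored.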

/// Calculate the cost of each function in \p M
/// \param SML Log Helper
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType
calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
                       DenseMap<const Function *, CostType> &CostMap) {
  CostType ModuleCost = 0;
  CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal =
            Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (isEntryPoint(&Fn))
      KernelCost += FnCost;
  }

  CostType FnCost = (ModuleCost - KernelCost);
  SML << "=> Total Module Cost: " << ModuleCost << '\n'
      << "  => KernelCost: " << KernelCost << " ("
      << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n"
      << "  => FnsCost: " << FnCost << " ("
      << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "%)\n";

  return ModuleCost;
}

static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || isEntryPoint(&F))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}
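
// For instance, an internal function whose address is stored in a table of
// function pointers has its address taken and is therefore a potential
// indirect call target, while an internal helper that is only ever called
// directly is not.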

/// When a kernel or any of its callees performs an indirect call, this function
/// takes over \ref addAllDependencies and adds all potentially callable
/// functions to \p Fns so they can be counted as dependencies of the kernel.
///
/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
/// presence of an indirect call, the function's resource usage is the same as
/// the most expensive function in the module.
/// \param M The module.
/// \param Fns[out] Resulting list of functions.
static void addAllIndirectCallDependencies(const Module &M,
                                           DenseSet<const Function *> &Fns) {
  for (const auto &Fn : M) {
    if (canBeIndirectlyCalled(Fn))
      Fns.insert(&Fn);
  }
}

/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
/// callee until all reachable functions have been gathered.
///
/// \param SML Log Helper
/// \param CG Call graph for \p Fn's module.
/// \param Fn Current function to look at.
/// \param Fns[out] Resulting list of functions.
/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
/// point, either in \p Fn or in one of the functions it calls. When that
/// happens, we fall back to adding all callable functions inside \p Fn's module
/// to \p Fns.
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
                               const Function &Fn,
                               DenseSet<const Function *> &Fns,
                               bool &HadIndirectCall) {
  assert(!Fn.isDeclaration());

  const Module &M = *Fn.getParent();
  SmallVector<const Function *> WorkList({&Fn});
  while (!WorkList.empty()) {
    const auto &CurFn = *WorkList.pop_back_val();
    assert(!CurFn.isDeclaration());

    // Scan for an indirect call. If such a call is found, we have to
    // conservatively assume this can call all non-entrypoint functions in the
    // module.

    for (auto &CGEntry : *CG[&CurFn]) {
      auto *CGNode = CGEntry.second;
      auto *Callee = CGNode->getFunction();
      if (!Callee) {
        // Functions have an edge towards CallsExternalNode if they're external
        // declarations, or if they do an indirect call. As we only process
        // definitions here, we know this means the function has an indirect
        // call. We then have to conservatively assume this can call all
        // non-entrypoint functions in the module.
        if (CGNode != CG.getCallsExternalNode())
          continue; // this is another function-less node we don't care about.

        SML << "Indirect call detected in " << getName(CurFn)
            << " - treating all non-entrypoint functions as "
               "potential dependencies\n";

        // TODO: Print an ORE as well ?
        addAllIndirectCallDependencies(M, Fns);
        HadIndirectCall = true;
        continue;
      }

      if (Callee->isDeclaration())
        continue;

      auto [It, Inserted] = Fns.insert(Callee);
      if (Inserted)
        WorkList.push_back(Callee);
    }
  }
}

/// Contains information about a kernel and its dependencies.
struct KernelWithDependencies {
  KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
                         const DenseMap<const Function *, CostType> &FnCosts,
                         const Function *Fn)
      : Fn(Fn) {
    addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
    TotalCost = FnCosts.at(Fn);
    for (const auto *Dep : Dependencies) {
      TotalCost += FnCosts.at(Dep);

      // We cannot duplicate functions with external linkage, or functions that
      // may be overridden at runtime.
      HasNonDuplicatableDependency |=
          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
    }
  }

  const Function *Fn = nullptr;
  DenseSet<const Function *> Dependencies;
  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
  bool HasIndirectCall = false;
  /// Whether any of \p Fn's dependencies cannot be duplicated.
  bool HasNonDuplicatableDependency = false;

  CostType TotalCost = 0;

  /// \returns true if this kernel and its dependencies can be considered large
  /// according to \p Threshold.
  bool isLarge(CostType Threshold) const {
    return TotalCost > Threshold && !Dependencies.empty();
  }
};

/// Calculates how much overlap there is between \p A and \p B.
/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
/// and B have no shared elements. Kernels do not count in overlap calculation.
static float calculateOverlap(const DenseSet<const Function *> &A,
                              const DenseSet<const Function *> &B) {
  DenseSet<const Function *> Total;
  for (const auto *F : A) {
    if (!isEntryPoint(F))
      Total.insert(F);
  }

  if (Total.empty())
    return 0.0f;

  unsigned NumCommon = 0;
  for (const auto *F : B) {
    if (isEntryPoint(F))
      continue;

    auto [It, Inserted] = Total.insert(F);
    if (!Inserted)
      ++NumCommon;
  }

  return static_cast<float>(NumCommon) / Total.size();
}
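
// For example, if A's non-kernel elements are {f, g, h} and B's are {g, h, k},
// Total ends up as {f, g, h, k} (A union B) and NumCommon is 2 (g and h), so
// the overlap is 2 / 4 = 0.5.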

/// Performs all of the partitioning work on \p M.
/// \param SML Log Helper
/// \param M Module to partition.
/// \param NumParts Number of partitions to create.
/// \param ModuleCost Total cost of all functions in \p M.
/// \param FnCosts Map of Function -> Cost
/// \param WorkList Kernels and their dependencies to process in order.
/// \returns The created partitions (a vector of size \p NumParts).
static std::vector<DenseSet<const Function *>>
doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
               CostType ModuleCost,
               const DenseMap<const Function *, CostType> &FnCosts,
               const SmallVector<KernelWithDependencies> &WorkList) {

  SML << "\n--Partitioning Starts--\n";

  // Calculate a "large kernel threshold". When a kernel's total import cost
  // exceeds this value, we will try to merge it with other, similarly large
  // kernels.
  //
  // e.g. let two kernels X and Y each have an import cost of ~10% of the
  // module; we assign X to a partition as usual, but when we get to Y, we
  // check whether it's worth also putting it in X's partition.
  const CostType LargeKernelThreshold =
      LargeKernelFactor
          ? CostType(((ModuleCost / NumParts) * LargeKernelFactor))
          : std::numeric_limits<CostType>::max();

  std::vector<DenseSet<const Function *>> Partitions;
  Partitions.resize(NumParts);

  // Assign a partition to each kernel, and try to keep the partitions more or
  // less balanced. We do that through a priority queue sorted in reverse, so we
  // can always look at the partition with the least content.
  //
  // There are some cases where we will be deliberately unbalanced though.
  //  - Large kernels: we try to merge with existing partitions to reduce code
  //    duplication.
  //  - Kernels with indirect or external calls always go in the first
  //    partition (P0).
  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
                              const std::pair<PartitionID, CostType> &b) {
    // When two partitions have the same cost, assign to the one with the
    // biggest ID first. This allows us to put things in P0 last, because P0 may
    // have other stuff added later.
    if (a.second == b.second)
      return a.first < b.first;
    return a.second > b.second;
  };
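
  // For example, with partition costs {P0: 10, P1: 10, P2: 5}, the queue sorts
  // to [(P0, 10), (P1, 10), (P2, 5)]; back() then yields P2, the cheapest
  // partition, and among equal costs the highest ID sits closest to the back,
  // so P0 is picked last.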

  // We can't use priority_queue here because we need to be able to access any
  // element. This makes this a bit inefficient as we need to sort it again
  // every time we change it, but it's a very small array anyway (likely under
  // 64 partitions) so it's a cheap operation.
  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
  for (unsigned I = 0; I < NumParts; ++I)
    BalancingQueue.push_back(std::make_pair(I, 0));

  // Helper function to handle assigning a kernel to a partition. This takes
  // care of updating the balancing queue.
  const auto AssignToPartition = [&](PartitionID PID,
                                     const KernelWithDependencies &KWD) {
    auto &FnsInPart = Partitions[PID];
    FnsInPart.insert(KWD.Fn);
    FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());

    SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> ";
    if (!KWD.Dependencies.empty())
      SML << KWD.Dependencies.size() << " dependencies added\n";

    // Update the balancing queue. We scan backwards because in the common case
    // the partition is at the end.
    for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
      if (QueuePID == PID) {
        CostType NewCost = 0;
        for (auto *Fn : Partitions[PID])
          NewCost += FnCosts.at(Fn);

        SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
        if (Cost) {
          SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
              << "% increase)";
        }
        SML << '\n';

        Cost = NewCost;
      }
    }

    sort(BalancingQueue, ComparePartitions);
  };

  for (auto &CurKernel : WorkList) {
    // When a kernel has indirect calls, it must stay in the first partition
    // alongside every reachable non-entry function. This is a nightmare case
    // for splitting as it severely limits what we can do.
    if (CurKernel.HasIndirectCall) {
      SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
          << " defaulting to P0\n";
      AssignToPartition(0, CurKernel);
      continue;
    }

    // When a kernel has non-duplicatable dependencies, we have to keep it in
    // the first partition as well. This is a conservative approach; a
    // finer-grained approach could keep track of exactly which dependencies
    // are non-duplicatable and just make sure they're grouped together.
    if (CurKernel.HasNonDuplicatableDependency) {
      SML << "Kernel with externally visible dependency "
          << getName(*CurKernel.Fn) << " defaulting to P0\n";
      AssignToPartition(0, CurKernel);
      continue;
    }

    // Be smart with large kernels to avoid duplicating their dependencies.
    if (CurKernel.isLarge(LargeKernelThreshold)) {
      assert(LargeKernelOverlapForMerge >= 0.0f &&
             LargeKernelOverlapForMerge <= 1.0f);
      SML << "Large Kernel: " << getName(*CurKernel.Fn)
          << " - looking for partition with at least "
          << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";

      bool Assigned = false;
      for (const auto &[PID, Fns] : enumerate(Partitions)) {
        float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
        SML << "  => " << format("%0.2f", Overlap * 100) << "% overlap with P"
            << PID << '\n';
        if (Overlap > LargeKernelOverlapForMerge) {
          SML << "  selecting P" << PID << '\n';
          AssignToPartition(PID, CurKernel);
          Assigned = true;
          // A kernel must be assigned exactly once; stop at the first
          // partition with enough overlap.
          break;
        }
      }

      if (Assigned)
        continue;
    }

    // Normal "load-balancing": assign to the partition with the least
    // pressure.
    auto [PID, CurCost] = BalancingQueue.back();
    AssignToPartition(PID, CurKernel);
  }

  // Work is mostly done now; verify the partitioning and add all functions we
  // may have missed (= unreachable, or we don't understand how they're
  // reached) to P0.
  DenseSet<const Function *> AllFunctions;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    CostType Cost = 0;
    for (auto *Fn : Part) {
      // External linkage functions should exclusively be in the first
      // partition at this stage. In theory, we should only ever see external
      // linkage functions here if they're kernels, or if they've been added
      // due to a kernel using indirect calls somewhere in its CallGraph.
      assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
      Cost += FnCosts.at(Fn);
    }
    SML << "P" << Idx << " has a total cost of " << Cost << " ("
        << format("%0.2f", (float(Cost) / ModuleCost) * 100)
        << "% of source module)\n";
    AllFunctions.insert(Part.begin(), Part.end());
  }

  // Add missed functions to P0. This will take care of adding things like
  // external functions with no callers in the module to P0. This should be
  // fairly rare as AMDGPU internalizes everything in most cases, so unused
  // internal functions would get removed.
  for (auto &Fn : M) {
    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
      SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
      Partitions[0].insert(&Fn);
    }
  }

  SML << "--Partitioning Done--\n\n";

  return Partitions;
}

static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}
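
// For example, an 'internal' function becomes 'external' with 'hidden'
// visibility: references to it from other partitions now resolve at link
// time, while the symbol is still not exported from the final binary.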

static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned N,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {

  SplitModuleLogger SML(M);

  CallGraph CG(M);

  // Externalize functions whose address is taken.
  //
  // This is needed because partitioning is purely based on calls, but
  // sometimes a kernel/function may just look at the address of another local
  // function and not do anything (no calls). After partitioning, that local
  // function may end up in a different module (so it's just a declaration in
  // the module where its address is taken), which emits an "undefined hidden
  // symbol" linker error.
  //
  // Additionally, it guides partitioning to not duplicate this function if
  // it's called directly at some point.
  for (auto &Fn : M) {
    if (Fn.hasAddressTaken()) {
      if (Fn.hasLocalLinkage()) {
        SML << "[externalize] " << Fn.getName()
            << " because its address is taken\n";
      }
      externalize(Fn);
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        SML << "[externalize] GV " << GV.getName() << '\n';
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  DenseMap<const Function *, CostType> FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);

  // Gather every kernel into a WorkList, then sort it by descending total cost
  // of the kernel so the biggest kernels are seen first.
  SmallVector<KernelWithDependencies> WorkList;
  for (auto &Fn : M) {
    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
  }
  sort(WorkList, [&](auto &A, auto &B) {
    // Sort by total cost, and if the total cost is identical, sort
    // alphabetically.
    if (A.TotalCost == B.TotalCost)
      return A.Fn->getName() < B.Fn->getName();
    return A.TotalCost > B.TotalCost;
  });

  if (SML) {
    SML << "Worklist\n";
    for (const auto &KWD : WorkList) {
      SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
          << " indirect:" << KWD.HasIndirectCall
          << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependency
          << ")\n";
      for (const auto *Dep : KWD.Dependencies)
        SML << "  [Dep] " << getName(*Dep) << '\n';
    }
  }

  // This performs all of the partitioning work.
  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
  assert(Partitions.size() == N);

  // If we didn't externalize GVs, then local GVs need to be conservatively
  // imported into every module (including their initializers), and then
  // cleaned up afterwards.
  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
    // We conservatively import private/internal GVs into every module and
    // clean them up afterwards.
    const auto *Var = dyn_cast<GlobalVariable>(GV);
    return Var && Var->hasLocalLinkage();
  };
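
  // For example, when globals were not externalized, an internal lookup table
  // referenced from several partitions is cloned (initializer included) into
  // every module by the predicate above; the copies that end up unused are
  // erased right after cloning below.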

  SML << "Creating " << N << " modules...\n";
  unsigned TotalFnImpls = 0;
  for (unsigned I = 0; I < N; ++I) {
    const auto &FnsInPart = Partitions[I];

    ValueToValueMapTy VMap;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV)) {
            // Check we don't import an external linkage function in any
            // partition other than P0.
#ifndef NDEBUG
            if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
              assert((I == 0) == FnsInPart.contains(Fn));
            }
#endif
            return FnsInPart.contains(Fn);
          }

          if (NeedsConservativeImport(GV))
            return true;

          // Everything else goes in the first partition.
          return I == 0;
        }));

    // Clean up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->globals())) {
      if (NeedsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    unsigned NumAllFns = 0, NumKernels = 0;
    for (auto &Cur : *MPart) {
      if (!Cur.isDeclaration()) {
        ++NumAllFns;
        if (isEntryPoint(&Cur))
          ++NumKernels;
      }
    }
    TotalFnImpls += NumAllFns;
    SML << " - Module " << I << " with " << NumAllFns << " functions ("
        << NumKernels << " kernels)\n";
    ModuleCallback(std::move(MPart));
  }

  SML << TotalFnImpls << " function definitions across all modules ("
      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
      << "% of original module)\n";
}
} // namespace

PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };
  splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
  // We don't change the original module.
  return PreservedAnalyses::all();
}