//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates local data store, LDS, uses from non-kernel functions.
// LDS is contiguous memory allocated per kernel execution.
//
// Background.
//
// The programming model is global variables, or equivalently function local
// static variables, accessible from kernels or other functions. For uses from
// kernels this is straightforward - assign an integer to the kernel for the
// memory required by all the variables combined, allocate them within that.
// For uses from functions there are performance tradeoffs to choose between.
//
// This model means the GPU runtime can specify the amount of memory allocated.
// If this is more than the kernel assumed, the excess can be made available
// using a language specific feature, which IR represents as a variable with
// no initializer. This feature is not yet implemented for non-kernel functions.
// This lowering could be extended to handle that use case, but would probably
// require closer integration with promoteAllocaToLDS.
//
// Consequences of this GPU feature:
// - memory is limited and exceeding it halts compilation
// - a global accessed by one kernel exists independent of other kernels
// - a global exists independent of simultaneous execution of the same kernel
// - the address of the global may differ between kernels, as their instances
//   do not alias, which permits allocating only the variables each kernel uses
// - if the address is allowed to differ, functions need help to find it
//
// Uses from kernels are implemented here by grouping them in a per-kernel
// struct instance. This duplicates the variables, accurately modelling their
// aliasing properties relative to a single global representation. It also
// permits control over alignment via padding.
//
// Uses from functions are more complicated and the primary purpose of this
// IR pass. Several different lowerings are chosen between to meet the
// requirement to avoid allocating any LDS where it is not necessary, as that
// impacts occupancy and may fail the compilation, while not imposing overhead
// on a feature whose primary advantage over global memory is performance. The
// basic design goal is to avoid one kernel imposing overhead on another.
//
// Implementation.
//
// LDS variables with constant annotation or non-undef initializer are passed
// through unchanged for simplification or error diagnostics in later passes.
// Non-undef initializers are not yet implemented for LDS.
//
// LDS variables that are always allocated at the same address can be found
// by lookup at that address. Otherwise runtime information/cost is required.
//
// The simplest strategy possible is to group all LDS variables in a single
// struct and allocate that struct in every kernel such that the original
// variables are always at the same address. LDS is however a limited resource
// so this strategy is unusable in practice. It is not implemented here.
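//
// As an illustrative sketch (hypothetical names, not taken from any test), an
// LDS variable used from a non-kernel function reaches this pass as IR along
// the lines of:
//
//   @x = internal addrspace(3) global i32 poison
//
//   define void @f() {
//     store i32 1, ptr addrspace(3) @x
//     ret void
//   }
//
//   define amdgpu_kernel void @k() {
//     call void @f()
//     ret void
//   }
//
// The strategies below differ in how @f finds the address that each kernel
// calling it allocated for @x.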
//
//   Strategy | Precise allocation | Zero runtime cost | General purpose |
//  ----------+--------------------+-------------------+-----------------+
//     Module |                 No |               Yes |             Yes |
//      Table |                Yes |                No |             Yes |
//     Kernel |                Yes |               Yes |              No |
//     Hybrid |                Yes |           Partial |             Yes |
//
// Module spends LDS memory to save cycles. Table spends cycles and global
// memory to save LDS. Kernel is as fast as kernel allocation but only works
// for variables that are known reachable from a single kernel. Hybrid picks
// between all three. When forced to choose between LDS and cycles it minimises
// LDS use.
//
// The "module" lowering implemented here finds LDS variables which are used by
// non-kernel functions and creates a new struct with a field for each of those
// LDS variables. Variables that are only used from kernels are excluded.
// Kernels that do not use this struct are annotated with the attribute
// amdgpu-elide-module-lds which allows the back end to elide the allocation.
//
// The "table" lowering implemented here has three components.
// First, each kernel is assigned a unique integer identifier which is
// available in functions it calls through the intrinsic amdgcn_lds_kernel_id.
// The integer is passed through a specific SGPR, and thus works with indirect
// calls.
// Second, each kernel allocates LDS variables independently of other kernels
// and writes the addresses it chose for each variable into an array in
// consistent order. If the kernel does not allocate a given variable, it
// writes undef to the corresponding array location. These arrays are written
// to a constant table in the order matching each kernel's unique integer
// identifier.
// Third, uses from non-kernel functions are replaced with a table lookup using
// the intrinsic function to find the address of the variable.
//
// "Kernel" lowering is only applicable for variables that are unambiguously
// reachable from exactly one kernel. For those cases, accesses to the variable
// can be lowered to a ConstantExpr address of a struct instance specific to
// that one kernel. This is zero cost in space and in compute. It will raise a
// fatal error on any variable that might be reachable from multiple kernels
// and is thus most easily used as part of the hybrid lowering strategy.
//
// Hybrid lowering is a mixture of the above. It uses the zero cost kernel
// lowering where it can. It lowers the variable accessed by the greatest
// number of kernels using the module strategy as that is free for the first
// variable. Any further variables that can be lowered with the module strategy
// without incurring LDS memory overhead are lowered that way. The remaining
// ones are lowered via table.
//
// Consequences:
// - No heuristics or user controlled magic numbers, hybrid is the right choice
// - Kernels that don't use functions (or have had them all inlined) are not
//   affected by any lowering for kernels that do.
// - Kernels that don't make indirect function calls are not affected by those
//   that do.
// - Variables which are used by lots of kernels, e.g. those injected by a
//   language runtime in most kernels, are expected to have no overhead
// - Implementations that instantiate templates per-kernel where those
//   templates use LDS are expected to hit the "Kernel" lowering strategy
// - The runtime properties impose a cost in compiler implementation complexity
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

#include <tuple>
#include <vector>

#include <cstdio>

#define DEBUG_TYPE "amdgpu-lower-module-lds"

using namespace llvm;

namespace {

cl::opt<bool> SuperAlignLDSGlobals(
    "amdgpu-super-align-lds-globals",
    cl::desc("Increase alignment of LDS if it is not on align boundary"),
    cl::init(true), cl::Hidden);

enum class LoweringKind { module, table, kernel, hybrid };
cl::opt<LoweringKind> LoweringKindLoc(
    "amdgpu-lower-module-lds-strategy",
    cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
    cl::init(LoweringKind::module),
    cl::values(
        clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
        clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
        clEnumValN(
            LoweringKind::kernel, "kernel",
            "Lower variables reachable from one kernel, otherwise abort"),
        clEnumValN(LoweringKind::hybrid, "hybrid",
                   "Lower via mixture of above strategies")));

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling conv, it instead calls into
  // isModuleEntryFunction which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}

class AMDGPULowerModuleLDS : public ModulePass {

  static void removeFromUsedList(Module &M, StringRef Name,
                                 SmallPtrSetImpl<Constant *> &ToRemove) {
    GlobalVariable *GV = M.getNamedGlobal(Name);
    if (!GV || ToRemove.empty()) {
      return;
    }

    SmallVector<Constant *, 16> Init;
    auto *CA = cast<ConstantArray>(GV->getInitializer());
    for (auto &Op : CA->operands()) {
      // ModuleUtils::appendToUsed only inserts Constants
      Constant *C = cast<Constant>(Op);
      if (!ToRemove.contains(C->stripPointerCasts())) {
        Init.push_back(C);
      }
    }

    if (Init.size() == CA->getNumOperands()) {
      return; // none to remove
    }

    GV->eraseFromParent();

    for (Constant *C : ToRemove) {
      C->removeDeadConstantUsers();
    }

    if (!Init.empty()) {
      ArrayType *ATy =
          ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
      GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                              ConstantArray::get(ATy, Init), Name);
      GV->setSection("llvm.metadata");
    }
  }

  static void removeFromUsedLists(Module &M,
                                  const DenseSet<GlobalVariable *> &LocalVars) {
    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith

    SmallPtrSet<Constant *, 32> LocalVarsSet;
    for (GlobalVariable *LocalVar : LocalVars)
      if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
        LocalVarsSet.insert(C);
    removeFromUsedList(M, "llvm.used", LocalVarsSet);
    removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
  }

  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
                               GlobalVariable *SGV) {
    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
    // that might call a function which accesses a field within it. This is
    // presently approximated to 'all kernels' if there are any such functions
    // in the module. This implicit use is redefined as an explicit use here so
    // that later passes, specifically PromoteAlloca, account for the required
    // memory without any knowledge of this transform.

    // An operand bundle on llvm.donothing works because the call instruction
    // survives until after the last pass that needs to account for LDS. It is
    // better than inline asm as the latter survives until the end of codegen.
    // A totally robust solution would be a function with the same semantics as
    // llvm.donothing that takes a pointer to the instance and is lowered to a
    // no-op after LDS is allocated, but that is not presently necessary.

    LLVMContext &Ctx = Func->getContext();

    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());

    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});

    Function *Decl =
        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});

    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};

    Builder.CreateCall(FTy, Decl, {},
                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
                       "");
  }

  static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
    // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
    // global may have uses from multiple different functions as a result.
    // This pass specialises LDS variables with respect to the kernel that
    // allocates them.

    // This is semantically equivalent to:
    // for (auto &F : M.functions())
    //   for (auto &BB : F)
    //     for (auto &I : BB)
    //       for (Use &Op : I.operands())
    //         if (constantExprUsesLDS(Op))
    //           replaceConstantExprInFunction(I, Op);

    bool Changed = false;

    // Find all ConstantExpr that are direct users of an LDS global
    SmallVector<ConstantExpr *> Stack;
    for (auto &GV : M.globals())
      if (AMDGPU::isLDSVariableToLower(GV))
        for (User *U : GV.users())
          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
            Stack.push_back(C);

    // Expand to include constexpr users of direct users
    SetVector<ConstantExpr *> ConstExprUsersOfLDS;
    while (!Stack.empty()) {
      ConstantExpr *V = Stack.pop_back_val();
      if (ConstExprUsersOfLDS.contains(V))
        continue;

      ConstExprUsersOfLDS.insert(V);

      for (auto *Nested : V->users())
        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Nested))
          Stack.push_back(CE);
    }

    // Find all instructions that use any of the ConstExpr users of LDS
    SetVector<Instruction *> InstructionWorklist;
    for (ConstantExpr *CE : ConstExprUsersOfLDS)
      for (User *U : CE->users())
        if (auto *I = dyn_cast<Instruction>(U))
          InstructionWorklist.insert(I);

    // Replace those ConstExpr operands with instructions
    while (!InstructionWorklist.empty()) {
      Instruction *I = InstructionWorklist.pop_back_val();
      for (Use &U : I->operands()) {

        auto *BI = I;
        if (auto *Phi = dyn_cast<PHINode>(I)) {
          BasicBlock *BB = Phi->getIncomingBlock(U);
          BasicBlock::iterator It = BB->getFirstInsertionPt();
          assert(It != BB->end() && "Unexpected empty basic block");
          BI = &(*(It));
        }

        if (ConstantExpr *C = dyn_cast<ConstantExpr>(U.get())) {
          if (ConstExprUsersOfLDS.contains(C)) {
            Changed = true;
            Instruction *NI = C->getAsInstruction(BI);
            InstructionWorklist.insert(NI);
            U.set(NI);
            C->removeDeadConstantUsers();
          }
        }
      }
    }

    return Changed;
  }

public:
  static char ID;

  AMDGPULowerModuleLDS() : ModulePass(ID) {
    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
  }

  using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;

  using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;

  static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
                                     FunctionVariableMap &kernels,
                                     FunctionVariableMap &functions) {

    // Get uses from the current function, excluding uses by called functions
    // Two output variables to avoid walking the globals list twice
    for (auto &GV : M.globals()) {
      if (!AMDGPU::isLDSVariableToLower(GV)) {
        continue;
      }

      SmallVector<User *, 16> Stack(GV.users());
      for (User *V : GV.users()) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          Function *F = I->getFunction();
          if (isKernelLDS(F)) {
            kernels[F].insert(&GV);
          } else {
            functions[F].insert(&GV);
          }
        }
      }
    }
  }

  struct LDSUsesInfoTy {
    FunctionVariableMap direct_access;
    FunctionVariableMap indirect_access;
  };

  static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {

    FunctionVariableMap direct_map_kernel;
    FunctionVariableMap direct_map_function;
    getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);

    // Collect variables that are used by functions whose address has escaped
    DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
    for (Function &F : M.functions()) {
      if (!isKernelLDS(&F))
        if (F.hasAddressTaken(nullptr,
                              /* IgnoreCallbackUses */ false,
                              /* IgnoreAssumeLikeCalls */ false,
                              /* IgnoreLLVMUsed */ true,
                              /* IgnoreArcAttachedCall */ false)) {
          set_union(VariablesReachableThroughFunctionPointer,
                    direct_map_function[&F]);
        }
    }

    auto functionMakesUnknownCall = [&](const Function *F) -> bool {
      assert(!F->isDeclaration());
      for (CallGraphNode::CallRecord R : *CG[F]) {
        if (!R.second->getFunction()) {
          return true;
        }
      }
      return false;
    };

    // Work out which variables are reachable through function calls
    FunctionVariableMap transitive_map_function = direct_map_function;

    // If the function makes any unknown call, assume the worst case that it
    // can access all variables accessed by functions whose address escaped
    for (Function &F : M.functions()) {
      if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
        if (!isKernelLDS(&F)) {
          set_union(transitive_map_function[&F],
                    VariablesReachableThroughFunctionPointer);
        }
      }
    }

    // Direct implementation of collecting all variables reachable from each
    // function
    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || isKernelLDS(&Func))
        continue;

      DenseSet<Function *> seen; // catches cycles
      SmallVector<Function *, 4> wip{&Func};

      while (!wip.empty()) {
        Function *F = wip.pop_back_val();

        // Can accelerate this by referring to transitive map for functions
        // that have already been computed, with more care than this
        set_union(transitive_map_function[&Func], direct_map_function[F]);

        for (CallGraphNode::CallRecord R : *CG[F]) {
          Function *ith = R.second->getFunction();
          if (ith) {
            if (!seen.contains(ith)) {
              seen.insert(ith);
              wip.push_back(ith);
            }
          }
        }
      }
    }

    // direct_map_kernel lists which variables are used by the kernel
    // find the variables which are used through a function call
    FunctionVariableMap indirect_map_kernel;

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      for (CallGraphNode::CallRecord R : *CG[&Func]) {
        Function *ith = R.second->getFunction();
        if (ith) {
          set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
        } else {
          set_union(indirect_map_kernel[&Func],
                    VariablesReachableThroughFunctionPointer);
        }
      }
    }

    return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
  }

  struct LDSVariableReplacement {
    GlobalVariable *SGV = nullptr;
    DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
  };

  // Remap from an LDS global to a ConstantExpr GEP pointing at where it has
  // been moved to for each kernel, and build an array with an element for
  // each kernel recording where the corresponding variable was remapped to.

  static Constant *getAddressesOfVariablesInKernel(
      LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
      DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
    // Create a ConstantArray containing the address of each Variable within
    // the kernel corresponding to LDSVarsToConstantGEP, or poison if that
    // kernel does not allocate it
    // TODO: Drop the ptrtoint conversion

    Type *I32 = Type::getInt32Ty(Ctx);

    ArrayType *KernelOffsetsType = ArrayType::get(I32, Variables.size());

    SmallVector<Constant *> Elements;
    for (size_t i = 0; i < Variables.size(); i++) {
      GlobalVariable *GV = Variables[i];
      if (LDSVarsToConstantGEP.count(GV) != 0) {
        auto elt = ConstantExpr::getPtrToInt(LDSVarsToConstantGEP[GV], I32);
        Elements.push_back(elt);
      } else {
        Elements.push_back(PoisonValue::get(I32));
      }
    }
    return ConstantArray::get(KernelOffsetsType, Elements);
  }

  static GlobalVariable *buildLookupTable(
      Module &M, ArrayRef<GlobalVariable *> Variables,
      ArrayRef<Function *> kernels,
      DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) {
    if (Variables.empty()) {
      return nullptr;
    }
    LLVMContext &Ctx = M.getContext();

    const size_t NumberVariables = Variables.size();
    const size_t NumberKernels = kernels.size();

    ArrayType *KernelOffsetsType =
        ArrayType::get(Type::getInt32Ty(Ctx), NumberVariables);

    ArrayType *AllKernelsOffsetsType =
        ArrayType::get(KernelOffsetsType, NumberKernels);

    std::vector<Constant *> overallConstantExprElts(NumberKernels);
    for (size_t i = 0; i < NumberKernels; i++) {
      LDSVariableReplacement Replacement = KernelToReplacement[kernels[i]];
      overallConstantExprElts[i] = getAddressesOfVariablesInKernel(
          Ctx, Variables, Replacement.LDSVarsToConstantGEP);
    }

    Constant *init =
        ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);

    return new GlobalVariable(
        M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
        "llvm.amdgcn.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
        AMDGPUAS::CONSTANT_ADDRESS);
  }

  void replaceUsesInInstructionsWithTableLookup(
      Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
      GlobalVariable *LookupTable) {

    LLVMContext &Ctx = M.getContext();
    IRBuilder<> Builder(Ctx);
    Type *I32 = Type::getInt32Ty(Ctx);

    // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
    // lowers to a read from a live in register. Emit it once in the entry
    // block to spare deduplicating it later.

    DenseMap<Function *, Value *> tableKernelIndexCache;
    auto getTableKernelIndex = [&](Function *F) -> Value * {
      if (tableKernelIndexCache.count(F) == 0) {
        LLVMContext &Ctx = M.getContext();
        FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
        Function *Decl =
            Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});

        BasicBlock::iterator it =
            F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
        Instruction &i = *it;
        Builder.SetInsertPoint(&i);

        tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
      }

      return tableKernelIndexCache[F];
    };

    for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
      auto *GV = ModuleScopeVariables[Index];

      for (Use &U : make_early_inc_range(GV->uses())) {
        auto *I = dyn_cast<Instruction>(U.getUser());
        if (!I)
          continue;

        Value *tableKernelIndex = getTableKernelIndex(I->getFunction());

        // So if the phi uses this value multiple times, what does this look
        // like?
        if (auto *Phi = dyn_cast<PHINode>(I)) {
          BasicBlock *BB = Phi->getIncomingBlock(U);
          Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
        } else {
          Builder.SetInsertPoint(I);
        }

        Value *GEPIdx[3] = {
            ConstantInt::get(I32, 0),
            tableKernelIndex,
            ConstantInt::get(I32, Index),
        };

        Value *Address = Builder.CreateInBoundsGEP(
            LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());

        Value *loaded = Builder.CreateLoad(I32, Address);

        Value *replacement =
            Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());

        U.set(replacement);
      }
    }
  }

  static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<GlobalVariable *> const &VariableSet) {

    DenseSet<Function *> KernelSet;

    if (VariableSet.empty())
      return KernelSet;

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;
      for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
        if (VariableSet.contains(GV)) {
          KernelSet.insert(&Func);
          break;
        }
      }
    }

    return KernelSet;
  }

  static GlobalVariable *
  chooseBestVariableForModuleStrategy(const DataLayout &DL,
                                      VariableFunctionMap &LDSVars) {
    // Find the global variable with the most indirect uses from kernels

    struct CandidateTy {
      GlobalVariable *GV = nullptr;
      size_t UserCount = 0;
      size_t Size = 0;

      CandidateTy() = default;

      CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize)
          : GV(GV), UserCount(UserCount), Size(AllocSize) {}

      bool operator<(const CandidateTy &Other) const {
        // Fewer users makes module scope variable less attractive
        if (UserCount < Other.UserCount) {
          return true;
        }
        if (UserCount > Other.UserCount) {
          return false;
        }

        // Bigger makes module scope variable less attractive
        if (Size < Other.Size) {
          return false;
        }

        if (Size > Other.Size) {
          return true;
        }

        // Arbitrary but consistent
        return GV->getName() < Other.GV->getName();
      }
    };

    CandidateTy MostUsed;

    for (auto &K : LDSVars) {
      GlobalVariable *GV = K.first;
      if (K.second.size() <= 1) {
        // A variable reachable by only one kernel is best lowered with kernel
        // strategy
        continue;
      }
      CandidateTy Candidate(
          GV, K.second.size(),
          DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
      if (MostUsed < Candidate)
        MostUsed = Candidate;
    }

    return MostUsed.GV;
  }

  bool runOnModule(Module &M) override {
    LLVMContext &Ctx = M.getContext();
    CallGraph CG = CallGraph(M);
    bool Changed = superAlignLDSGlobals(M);

    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);

    Changed = true; // todo: narrow this down

    // For each kernel, what variables does it access directly or through
    // callees
    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);

    // For each variable accessed through callees, which kernels access it
    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
    for (auto &K : LDSUsesInfo.indirect_access) {
      Function *F = K.first;
      assert(isKernelLDS(F));
      for (GlobalVariable *GV : K.second) {
        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
      }
    }

    // Partition variables into the different strategies
    DenseSet<GlobalVariable *> ModuleScopeVariables;
    DenseSet<GlobalVariable *> TableLookupVariables;
    DenseSet<GlobalVariable *> KernelAccessVariables;

    {
      GlobalVariable *HybridModuleRoot =
          LoweringKindLoc != LoweringKind::hybrid
              ? nullptr
              : chooseBestVariableForModuleStrategy(
                    M.getDataLayout(),
                    LDSToKernelsThatNeedToAccessItIndirectly);

      DenseSet<Function *> const EmptySet;
      DenseSet<Function *> const &HybridModuleRootKernels =
          HybridModuleRoot
              ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
              : EmptySet;

      for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
        // Each iteration of this loop assigns exactly one global variable to
        // exactly one of the implementation strategies.

        GlobalVariable *GV = K.first;
        assert(AMDGPU::isLDSVariableToLower(*GV));
        assert(K.second.size() != 0);

        switch (LoweringKindLoc) {
        case LoweringKind::module:
          ModuleScopeVariables.insert(GV);
          break;

        case LoweringKind::table:
          TableLookupVariables.insert(GV);
          break;

        case LoweringKind::kernel:
          if (K.second.size() == 1) {
            KernelAccessVariables.insert(GV);
          } else {
            report_fatal_error("Cannot lower LDS to kernel access as it is "
                               "reachable from multiple kernels");
          }
          break;

        case LoweringKind::hybrid: {
          if (GV == HybridModuleRoot) {
            assert(K.second.size() != 1);
            ModuleScopeVariables.insert(GV);
          } else if (K.second.size() == 1) {
            KernelAccessVariables.insert(GV);
          } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
            ModuleScopeVariables.insert(GV);
          } else {
            TableLookupVariables.insert(GV);
          }
          break;
        }
        }
      }

      assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
                 KernelAccessVariables.size() ==
             LDSToKernelsThatNeedToAccessItIndirectly.size());
    } // Variables have now been partitioned into the three lowering strategies.
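
    // Illustrative sketch of the hybrid partition above (hypothetical names,
    // not taken from any test): with kernels k0..k3 and indirect access sets
    // v0:{k0,k1,k2}, v1:{k3}, v2:{k0,k2}, v3:{k2,k3}, the root is v0 (used by
    // the most kernels) and is lowered via module scope, v1 uses the kernel
    // strategy (single kernel), v2 also goes to module scope as {k0,k2} is a
    // subset of v0's kernels, and v3 falls back to the table strategy.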

    // If the kernel accesses a variable that is going to be stored in the
    // module instance through a call then that kernel needs to allocate the
    // module instance
    DenseSet<Function *> KernelsThatAllocateModuleLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        ModuleScopeVariables);
    DenseSet<Function *> KernelsThatAllocateTableLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        TableLookupVariables);

    if (!ModuleScopeVariables.empty()) {
      LDSVariableReplacement ModuleScopeReplacement =
          createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
                                       ModuleScopeVariables);

      appendToCompilerUsed(M,
                           {static_cast<GlobalValue *>(
                               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                                   cast<Constant>(ModuleScopeReplacement.SGV),
                                   Type::getInt8PtrTy(Ctx)))});

      // historic
      removeFromUsedLists(M, ModuleScopeVariables);

      // Replace all uses of module scope variable from non-kernel functions
      replaceLDSVariablesWithStruct(
          M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
            Instruction *I = dyn_cast<Instruction>(U.getUser());
            if (!I) {
              return false;
            }
            Function *F = I->getFunction();
            return !isKernelLDS(F);
          });

      // Replace uses of module scope variable from kernel functions that
      // allocate the module scope variable, otherwise leave them unchanged
      // Record on each kernel whether the module scope global is used by it

      LLVMContext &Ctx = M.getContext();
      IRBuilder<> Builder(Ctx);

      for (Function &Func : M.functions()) {
        if (Func.isDeclaration() || !isKernelLDS(&Func))
          continue;

        if (KernelsThatAllocateModuleLDS.contains(&Func)) {
          replaceLDSVariablesWithStruct(
              M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
                Instruction *I = dyn_cast<Instruction>(U.getUser());
                if (!I) {
                  return false;
                }
                Function *F = I->getFunction();
                return F == &Func;
              });

          markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);

        } else {
          Func.addFnAttr("amdgpu-elide-module-lds");
        }
      }
    }

    // Create a struct for each kernel for the non-module-scope variables
    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      DenseSet<GlobalVariable *> KernelUsedVariables;
      for (auto &v : LDSUsesInfo.direct_access[&Func]) {
        KernelUsedVariables.insert(v);
      }
      for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
        KernelUsedVariables.insert(v);
      }

      // Variables allocated in module lds must all resolve to that struct,
      // not to the per-kernel instance.
      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
        for (GlobalVariable *v : ModuleScopeVariables) {
          KernelUsedVariables.erase(v);
        }
      }

      if (KernelUsedVariables.empty()) {
        // Either used no LDS, or all the LDS it used was also in module
        continue;
      }

      // The association between kernel function and LDS struct is done by
      // symbol name, which only works if the function in question has a name.
      // This is not expected to be a problem in practice as kernels are
      // called by name, making anonymous ones (which are named by the
      // backend) difficult to use. This does mean that llvm test cases need
      // to name the kernels.
      if (!Func.hasName()) {
        report_fatal_error("Anonymous kernels cannot use LDS variables");
      }

      std::string VarName =
          (Twine("llvm.amdgcn.kernel.") + Func.getName() + ".lds").str();

      auto Replacement =
          createLDSVariableReplacement(M, VarName, KernelUsedVariables);

      // remove preserves existing codegen
      removeFromUsedLists(M, KernelUsedVariables);
      KernelToReplacement[&Func] = Replacement;

      // Rewrite uses within kernel to the new struct
      replaceLDSVariablesWithStruct(
          M, KernelUsedVariables, Replacement, [&Func](Use &U) {
            Instruction *I = dyn_cast<Instruction>(U.getUser());
            return I && I->getFunction() == &Func;
          });
    }

    // Lower zero cost accesses to the kernel instances just created
    for (auto &GV : KernelAccessVariables) {
      auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV];
      assert(funcs.size() == 1); // Only one kernel can access it
      LDSVariableReplacement Replacement =
          KernelToReplacement[*(funcs.begin())];

      DenseSet<GlobalVariable *> Vec;
      Vec.insert(GV);

      replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) {
        return isa<Instruction>(U.getUser());
      });
    }

    if (!KernelsThatAllocateTableLDS.empty()) {
      // Collect the kernels that allocate table lookup LDS
      std::vector<Function *> OrderedKernels;
      {
        for (Function &Func : M.functions()) {
          if (Func.isDeclaration())
            continue;
          if (!isKernelLDS(&Func))
            continue;

          if (KernelsThatAllocateTableLDS.contains(&Func)) {
            assert(Func.hasName()); // else fatal error earlier
            OrderedKernels.push_back(&Func);
          }
        }

        // Put them in an arbitrary but reproducible order
        llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
                   [](const Function *lhs, const Function *rhs) -> bool {
                     return lhs->getName() < rhs->getName();
                   });

        // Annotate the kernels with their order in this vector
        LLVMContext &Ctx = M.getContext();
        IRBuilder<> Builder(Ctx);

        if (OrderedKernels.size() > UINT32_MAX) {
          // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
          report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
        }

        for (size_t i = 0; i < OrderedKernels.size(); i++) {
          Metadata *AttrMDArgs[1] = {
              ConstantAsMetadata::get(Builder.getInt32(i)),
          };
          OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
                                         MDNode::get(Ctx, AttrMDArgs));

          markUsedByKernel(Builder, OrderedKernels[i],
                           KernelToReplacement[OrderedKernels[i]].SGV);
        }
      }

      // The order must be consistent between lookup table and accesses to
      // lookup table
      std::vector<GlobalVariable *> TableLookupVariablesOrdered(
          TableLookupVariables.begin(), TableLookupVariables.end());
      llvm::sort(TableLookupVariablesOrdered.begin(),
                 TableLookupVariablesOrdered.end(),
                 [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
                   return lhs->getName() < rhs->getName();
                 });

      GlobalVariable *LookupTable = buildLookupTable(
          M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
      replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
                                               LookupTable);
    }

    for (auto &GV : make_early_inc_range(M.globals()))
      if (AMDGPU::isLDSVariableToLower(GV)) {

        // probably want to remove from used lists
        GV.removeDeadConstantUsers();
        if (GV.use_empty())
          GV.eraseFromParent();
      }

    return Changed;
  }

private:
  // Increase the alignment of LDS globals if necessary to maximise the chance
  // that we can use aligned LDS instructions to access them.
  static bool superAlignLDSGlobals(Module &M) {
    const DataLayout &DL = M.getDataLayout();
    bool Changed = false;
    if (!SuperAlignLDSGlobals) {
      return Changed;
    }

    for (auto &GV : M.globals()) {
      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
        // Only changing alignment of LDS variables
        continue;
      }
      if (!GV.hasInitializer()) {
        // cuda/hip extern __shared__ variable, leave alignment alone
        continue;
      }

      Align Alignment = AMDGPU::getAlign(DL, &GV);
      TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());

      if (GVSize > 8) {
        // We might want to use a b96 or b128 load/store
        Alignment = std::max(Alignment, Align(16));
      } else if (GVSize > 4) {
        // We might want to use a b64 load/store
        Alignment = std::max(Alignment, Align(8));
      } else if (GVSize > 2) {
        // We might want to use a b32 load/store
        Alignment = std::max(Alignment, Align(4));
      } else if (GVSize > 1) {
        // We might want to use a b16 load/store
        Alignment = std::max(Alignment, Align(2));
      }

      if (Alignment != AMDGPU::getAlign(DL, &GV)) {
        Changed = true;
        GV.setAlignment(Alignment);
      }
    }
    return Changed;
  }

  static LDSVariableReplacement createLDSVariableReplacement(
      Module &M, std::string VarName,
      DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
    // Create a struct instance containing LDSVarsToTransform and map from
    // those variables to ConstantExprGEP
    // Variables may be introduced to meet alignment requirements. No aliasing
    // metadata is useful for these as they have no uses. Erased before return.

    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();
    assert(!LDSVarsToTransform.empty());

    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
    LayoutFields.reserve(LDSVarsToTransform.size());
    {
      // The order of fields in this struct depends on the order of
      // variables in the argument which varies when changing how they
      // are identified, leading to spurious test breakage.
      std::vector<GlobalVariable *> Sorted(LDSVarsToTransform.begin(),
                                           LDSVarsToTransform.end());
      llvm::sort(Sorted.begin(), Sorted.end(),
                 [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
                   return lhs->getName() < rhs->getName();
                 });
      for (GlobalVariable *GV : Sorted) {
        OptimizedStructLayoutField F(GV,
                                     DL.getTypeAllocSize(GV->getValueType()),
                                     AMDGPU::getAlign(DL, GV));
        LayoutFields.emplace_back(F);
      }
    }

    performOptimizedStructLayout(LayoutFields);

    std::vector<GlobalVariable *> LocalVars;
    BitVector IsPaddingField;
    LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
    IsPaddingField.reserve(LDSVarsToTransform.size());
    {
      uint64_t CurrentOffset = 0;
      for (size_t I = 0; I < LayoutFields.size(); I++) {
        GlobalVariable *FGV = static_cast<GlobalVariable *>(
            const_cast<void *>(LayoutFields[I].Id));
        Align DataAlign = LayoutFields[I].Alignment;

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet alignment requested
          // Note (o + (a - (o % a)) ) % a == 0
          //      (offset + Padding  ) % align == 0

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
              false));
          IsPaddingField.push_back(true);
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        IsPaddingField.push_back(false);
        CurrentOffset += LayoutFields[I].Size;
      }
    }

    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(StructAlign);

    DenseMap<GlobalVariable *, Constant *> Map;
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
      if (IsPaddingField[I]) {
        assert(GV->use_empty());
        GV->eraseFromParent();
      } else {
        Map[GV] = GEP;
      }
    }
    assert(Map.size() == LDSVarsToTransform.size());
    return {SGV, std::move(Map)};
  }

  template <typename PredicateTy>
  void replaceLDSVariablesWithStruct(
      Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
      LDSVariableReplacement Replacement, PredicateTy Predicate) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // A hack... we need to insert the aliasing info in a predictable order for
    // lit tests. Would like to have them in a stable order already, ideally
    // the same order they get allocated, which might mean an ordered set
    // container
    std::vector<GlobalVariable *> LDSVarsToTransform(
        LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end());
    llvm::sort(LDSVarsToTransform.begin(), LDSVarsToTransform.end(),
               [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
                 return lhs->getName() < rhs->getName();
               });

    // Create alias.scope and their lists. Each field in the new structure
    // does not alias any of the other fields.
    SmallVector<MDNode *> AliasScopes;
    SmallVector<Metadata *> NoAliasList;
    const size_t NumberVars = LDSVarsToTransform.size();
    if (NumberVars > 1) {
      MDBuilder MDB(Ctx);
      AliasScopes.reserve(NumberVars);
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      for (size_t I = 0; I < NumberVars; I++) {
        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
        AliasScopes.push_back(Scope);
      }
      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
    }

    // Replace uses of ith variable with a constantexpr to the corresponding
    // field of the instance that will be allocated by AMDGPUMachineFunction
    for (size_t I = 0; I < NumberVars; I++) {
      GlobalVariable *GV = LDSVarsToTransform[I];
      Constant *GEP = Replacement.LDSVarsToConstantGEP[GV];

      GV->replaceUsesWithIf(GEP, Predicate);

      APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
      GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
      uint64_t Offset = APOff.getZExtValue();

      Align A =
          commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset);

      if (I)
        NoAliasList[I - 1] = AliasScopes[I - 1];
      MDNode *NoAlias =
          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
      MDNode *AliasScope =
          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
    }
  }

  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
                                MDNode *AliasScope, MDNode *NoAlias,
                                unsigned MaxDepth = 5) {
    if (!MaxDepth || (A == 1 && !AliasScope))
      return;

    for (User *U : Ptr->users()) {
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (AliasScope && I->mayReadOrWriteMemory()) {
          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
          AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
                   : AliasScope);
          I->setMetadata(LLVMContext::MD_alias_scope, AS);

          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
          NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
          I->setMetadata(LLVMContext::MD_noalias, NA);
        }
      }

      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
        continue;
      }
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        // No atomicrmw operation works on pointers today, but check anyway in
        // case that changes or we end up processing a ConstantExpr.
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr) {
          Align GA;
          if (GEP->accumulateConstantOffset(DL, Off))
            GA = commonAlignment(A, Off.getLimitedValue());
          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
        }
        continue;
      }
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
      }
    }
  }
};

} // namespace

char AMDGPULowerModuleLDS::ID = 0;

char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;

INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
                "Lower uses of LDS variables from non-kernel functions", false,
                false)

ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
  return new AMDGPULowerModuleLDS();
}

PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}