//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates LDS uses from non-kernel functions.
//
// The strategy is to create a new struct with a field for each LDS variable
// and allocate that struct at the same address for every kernel. Uses of the
// original LDS variables are then replaced with compile time offsets from that
// known address. AMDGPUMachineFunction allocates the LDS global.
//
// Local variables with constant annotation or non-undef initializer are passed
// through unchanged for simplification or error diagnostics in later passes.
//
// To reduce the memory overhead, variables that are only used by kernels are
// excluded from this transform. The analysis to determine whether a variable
// is only used by a kernel is cheap and conservative, so this may allocate
// a variable in every kernel when it was not strictly necessary to do so.
//
// A possible future refinement is to specialise the structure per-kernel, so
// that fields can be elided based on more expensive analysis.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPULDSUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <vector>

#define DEBUG_TYPE "amdgpu-lower-module-lds"

using namespace llvm;

// When set (the default), LDS globals are over-aligned relative to their
// natural alignment so wider LDS load/store instructions can be used; see
// the size-based bumps in processUsedLDS.
static cl::opt<bool> SuperAlignLDSGlobals(
    "amdgpu-super-align-lds-globals",
    cl::desc("Increase alignment of LDS if it is not on align boundary"),
    cl::init(true), cl::Hidden);

namespace {

class AMDGPULowerModuleLDS : public ModulePass {

  // Rebuild the appending-linkage array named \p Name (expected to be
  // "llvm.used" or "llvm.compiler.used", see removeFromUsedLists) with every
  // entry whose stripped pointer is in \p ToRemove filtered out. If no entry
  // survives, the array is erased and not recreated.
  static void removeFromUsedList(Module &M, StringRef Name,
                                 SmallPtrSetImpl<Constant *> &ToRemove) {
    GlobalVariable *GV = M.getNamedGlobal(Name);
    if (!GV || ToRemove.empty()) {
      return;
    }

    // Collect the entries that are NOT being removed. Entries are compared
    // after stripPointerCasts because the used lists hold pointer casts of
    // the underlying globals.
    SmallVector<Constant *, 16> Init;
    auto *CA = cast<ConstantArray>(GV->getInitializer());
    for (auto &Op : CA->operands()) {
      // ModuleUtils::appendToUsed only inserts Constants
      Constant *C = cast<Constant>(Op);
      if (!ToRemove.contains(C->stripPointerCasts())) {
        Init.push_back(C);
      }
    }

    if (Init.size() == CA->getNumOperands()) {
      return; // none to remove
    }

    // Erase the old array first so the name is free for the replacement
    // created below.
    GV->eraseFromParent();

    // Erasing the array may have left dead constant casts of the removed
    // globals behind; drop them so the globals can later be erased cleanly.
    for (Constant *C : ToRemove) {
      C->removeDeadConstantUsers();
    }

    if (!Init.empty()) {
      ArrayType *ATy =
          ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
      GV =
          new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                   ConstantArray::get(ATy, Init), Name);
      GV->setSection("llvm.metadata");
    }
  }

  // Remove \p LocalVars from both @llvm.used and @llvm.compiler.used. Called
  // before the variables are RAUW'd with constant GEPs (see the note at the
  // call site in processUsedLDS about the verifier).
  static void
  removeFromUsedLists(Module &M,
                      const std::vector<GlobalVariable *> &LocalVars) {
    SmallPtrSet<Constant *, 32> LocalVarsSet;
    for (size_t I = 0; I < LocalVars.size(); I++) {
      if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
        LocalVarsSet.insert(C);
      }
    }
    removeFromUsedList(M, "llvm.used", LocalVarsSet);
    removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
  }

  // Insert, at the start of kernel \p Func, an artificial use of \p SGV: a
  // call to llvm.donothing carrying an "ExplicitUse" operand bundle that
  // holds a GEP to the struct instance.
  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
                               GlobalVariable *SGV) {
    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
    // that might call a function which accesses a field within it. This is
    // presently approximated to 'all kernels' if there are any such functions
    // in the module. This implicit use is reified as an explicit use here so
    // that later passes, specifically PromoteAlloca, account for the required
    // memory without any knowledge of this transform.

    // An operand bundle on llvm.donothing works because the call instruction
    // survives until after the last pass that needs to account for LDS. It is
    // better than inline asm as the latter survives until the end of codegen. A
    // totally robust solution would be a function with the same semantics as
    // llvm.donothing that takes a pointer to the instance and is lowered to a
    // no-op after LDS is allocated, but that is not presently necessary.

    LLVMContext &Ctx = Func->getContext();

    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());

    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});

    Function *Decl =
        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});

    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};

    Builder.CreateCall(FTy, Decl, {},
                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
                       "");
  }

private:
  // Globals referenced by the module's used lists, cached for the duration of
  // runOnModule (populated via AMDGPU::getUsedList, kept in sync as globals
  // are erased in processUsedLDS).
  SmallPtrSet<GlobalValue *, 32> UsedList;

public:
  static char ID;

  AMDGPULowerModuleLDS() : ModulePass(ID) {
    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
  }

  // First lower the variables reachable from non-kernel functions into the
  // module-scope struct, then lower each kernel's remaining variables into a
  // per-kernel struct.
  bool runOnModule(Module &M) override {
    UsedList = AMDGPU::getUsedList(M);

    bool Changed = processUsedLDS(M);

    for (Function &F : M.functions()) {
      if (!AMDGPU::isKernelCC(&F))
        continue;
      Changed |= processUsedLDS(M, &F);
    }

    UsedList.clear();
    return Changed;
  }

private:
  // Core rewrite. With F == nullptr, builds the module-scope struct
  // llvm.amdgcn.module.lds from the variables selected by
  // AMDGPU::findVariablesToLower(M, nullptr); with a kernel \p F, builds
  // llvm.amdgcn.kernel.<name>.lds for that kernel's variables. Uses of each
  // variable are replaced with a constant GEP to the corresponding struct
  // field. Returns true iff any variable was rewritten.
  bool processUsedLDS(Module &M, Function *F = nullptr) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // Find variables to move into new struct instance
    std::vector<GlobalVariable *> FoundLocalVars =
        AMDGPU::findVariablesToLower(M, F);

    if (FoundLocalVars.empty()) {
      // No variables to rewrite, no changes made.
      return false;
    }

    // Increase the alignment of LDS globals if necessary to maximise the chance
    // that we can use aligned LDS instructions to access them.
    if (SuperAlignLDSGlobals) {
      for (auto *GV : FoundLocalVars) {
        Align Alignment = AMDGPU::getAlign(DL, GV);
        TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());

        if (GVSize > 8) {
          // We might want to use a b96 or b128 load/store
          Alignment = std::max(Alignment, Align(16));
        } else if (GVSize > 4) {
          // We might want to use a b64 load/store
          Alignment = std::max(Alignment, Align(8));
        } else if (GVSize > 2) {
          // We might want to use a b32 load/store
          Alignment = std::max(Alignment, Align(4));
        } else if (GVSize > 1) {
          // We might want to use a b16 load/store
          Alignment = std::max(Alignment, Align(2));
        }

        GV->setAlignment(Alignment);
      }
    }

    // Sort by alignment, descending, to minimise padding.
    // On ties, sort by size, descending, then by name, lexicographical.
    llvm::stable_sort(
        FoundLocalVars,
        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
          Align ALHS = AMDGPU::getAlign(DL, LHS);
          Align ARHS = AMDGPU::getAlign(DL, RHS);
          if (ALHS != ARHS) {
            return ALHS > ARHS;
          }

          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
          if (SLHS != SRHS) {
            return SLHS > SRHS;
          }

          // By variable name on tie for predictable order in test cases.
          return LHS->getName() < RHS->getName();
        });

    // Build the final member list, interleaving anonymous i8-array padding
    // globals where a member's alignment is not already satisfied by the
    // running offset.
    std::vector<GlobalVariable *> LocalVars;
    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
    {
      // This usually won't need to insert any padding, perhaps avoid the alloc
      uint64_t CurrentOffset = 0;
      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
        GlobalVariable *FGV = FoundLocalVars[I];
        Align DataAlign = AMDGPU::getAlign(DL, FGV);

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet alignment requested
          // Note (o + (a - (o % a)) ) % a == 0
          //      (offset + Padding ) % align == 0

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
              false));
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
      }
    }

    // Field types of the replacement struct, in final member order.
    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    std::string VarName(
        F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
          : "llvm.amdgcn.module.lds");
    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align MaxAlign =
        AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment

    // The struct instance itself, in the LDS address space with undef
    // initializer; AMDGPUMachineFunction performs the actual allocation.
    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(MaxAlign);
    if (!F) {
      // Keep the module-scope instance alive for the non-kernel functions
      // that reference its fields.
      appendToCompilerUsed(
          M, {static_cast<GlobalValue *>(
                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
    }

    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
    removeFromUsedLists(M, LocalVars);

    // Replace uses of ith variable with a constantexpr to the ith field of the
    // instance that will be allocated by AMDGPUMachineFunction
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
      if (F) {
        // Replace all constant uses with instructions if they belong to the
        // current kernel.
        for (User *U : make_early_inc_range(GV->users())) {
          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
            AMDGPU::replaceConstantUsesInFunction(C, F);
        }

        GV->removeDeadConstantUsers();

        // Only rewrite uses that are instructions inside this kernel; uses
        // from other functions are left for their own processUsedLDS call.
        GV->replaceUsesWithIf(GEP, [F](Use &U) {
          Instruction *I = dyn_cast<Instruction>(U.getUser());
          return I && I->getFunction() == F;
        });
      } else {
        GV->replaceAllUsesWith(GEP);
      }
      if (GV->use_empty()) {
        // Keep the cached used-list set in sync with the module.
        UsedList.erase(GV);
        GV->eraseFromParent();
      }
    }

    // Mark each kernel with a use of the allocated structure (via
    // markUsedByKernel's llvm.donothing operand bundle). This is not
    // necessary for lowering. This lets other passes, specifically
    // PromoteAlloca, accurately calculate how much LDS will be used by the
    // kernel after lowering.
    if (!F) {
      IRBuilder<> Builder(Ctx);
      SmallPtrSet<Function *, 32> Kernels;
      for (auto &I : M.functions()) {
        Function *Func = &I;
        if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) {
          markUsedByKernel(Builder, Func, SGV);
          Kernels.insert(Func);
        }
      }
    }
    return true;
  }
};

} // namespace
char AMDGPULowerModuleLDS::ID = 0;

char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;

INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
                "Lower uses of LDS variables from non-kernel functions", false,
                false)

// Legacy pass manager entry point.
ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
  return new AMDGPULowerModuleLDS();
}

// New pass manager entry point; delegates to the legacy implementation.
PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}